From 0ff70821c9b0b991197fa7f3264bf9dd78b8d4b3 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sun, 23 Nov 2025 20:18:55 -0800
Subject: [PATCH 001/134] [Core] Deprecate `xformers` (#29262)
Signed-off-by: Roger Wang
---
docker/Dockerfile.nightly_torch | 35 +-
.../contributing/ci/update_pytorch_version.md | 15 -
docs/getting_started/quickstart.md | 2 +-
.../openai_embedding_long_text/service.sh | 1 -
requirements/cuda.txt | 1 -
.../test_basic_correctness.py | 3 -
tests/kernels/attention/test_attention.py | 129 ------
.../attention/test_attention_selector.py | 8 +-
tests/kernels/attention/test_mha_attn.py | 4 -
tests/kernels/utils.py | 78 +---
tests/lora/test_minicpmv_tp.py | 12 -
tests/lora/test_qwen2vl.py | 15 -
vllm/attention/backends/registry.py | 1 -
vllm/attention/layer.py | 38 --
vllm/attention/ops/vit_attn_wrappers.py | 38 +-
vllm/attention/selector.py | 9 +-
vllm/config/multimodal.py | 6 +
vllm/envs.py | 1 -
vllm/model_executor/models/dots_ocr.py | 27 +-
vllm/model_executor/models/ernie45_vl.py | 33 +-
vllm/model_executor/models/glm4_1v.py | 31 +-
vllm/model_executor/models/keye.py | 30 +-
vllm/model_executor/models/paddleocr_vl.py | 13 -
vllm/model_executor/models/pixtral.py | 1 +
vllm/model_executor/models/qwen2_5_vl.py | 25 +-
vllm/model_executor/models/qwen2_vl.py | 31 +-
.../models/qwen3_omni_moe_thinker.py | 12 +-
vllm/model_executor/models/qwen3_vl.py | 13 +-
vllm/platforms/cuda.py | 7 +-
vllm/utils/__init__.py | 1 -
vllm/v1/attention/backends/xformers.py | 420 ------------------
31 files changed, 77 insertions(+), 963 deletions(-)
delete mode 100644 vllm/v1/attention/backends/xformers.py
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index b88b9c4992200..d663c82c3885e 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
-# must put before installing xformers, so it can install the correct version of xfomrers.
-ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-
-# Build xformers with cuda and torch nightly
-# following official xformers guidance: https://github.com/facebookresearch/xformers#build
-# todo(elainewy): cache xformers build result for faster build
-ARG max_jobs=16
-ENV MAX_JOBS=${max_jobs}
-ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
- --mount=type=cache,target=/root/.cache/uv \
- echo 'git clone xformers...' \
- && git clone https://github.com/facebookresearch/xformers.git --recursive \
- && cd xformers \
- && git checkout ${XFORMERS_COMMIT} \
- && git submodule update --init --recursive \
- && echo 'finish git clone xformers...' \
- && rm -rf build \
- && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
- && cd .. \
- && rm -rf xformers
-
-RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system xformers-dist/*.whl --verbose
-
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
@@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose
-# install xformers again for the new environment
-RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
- --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
-
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer
@@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
-RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
+RUN pip freeze | grep -E 'torch|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md
index 09fd85a466eed..735bb2e205332 100644
--- a/docs/contributing/ci/update_pytorch_version.md
+++ b/docs/contributing/ci/update_pytorch_version.md
@@ -98,21 +98,6 @@ to warm it up so that future builds are faster.
-## Update dependencies
-
-Several vLLM dependencies like xFormers depend on PyTorch and need
-to be updated accordingly. Rather than waiting for all of them to publish new
-releases (which would take too much time), they can be built from
-source to unblock the update process.
-
-### xFormers
-
-```bash
-export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
-MAX_JOBS=16 uv pip install --system \
- --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
-```
-
## Update all the different vLLM platforms
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 9e86f785b10c7..94920dc5306b3 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
-- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
+- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
For AMD ROCm, you can further control the specific Attention implementation using the following variables:
diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh
index 1577de85f7ff2..b5c92749466b0 100644
--- a/examples/online_serving/openai_embedding_long_text/service.sh
+++ b/examples/online_serving/openai_embedding_long_text/service.sh
@@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"}
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
export VLLM_ENABLE_CHUNKED_PROCESSING=true
export CUDA_VISIBLE_DEVICES=2,3,4,5
-# export VLLM_ATTENTION_BACKEND=XFORMERS
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index d63fe9e1e77c1..15e8aadc56f47 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,6 +9,5 @@ torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0cf1e85d4e8ee..521d6c33dd390 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -74,9 +74,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
- if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
- pytest.skip(f"{backend} does not support gemma2 with full context length.")
-
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 9662e73321ebe..1a7d5ce0ddc1e 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -13,12 +13,6 @@ from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
-if not current_platform.is_rocm():
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
-
- from tests.kernels.utils import make_alibi_bias
-
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
@@ -448,129 +442,6 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0)
-@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(
- current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
-)
-@torch.inference_mode()
-def test_multi_query_kv_attention(
- num_seqs: int,
- num_heads: tuple[int, int],
- head_size: int,
- dtype: torch.dtype,
- seed: int,
- device: str,
- use_alibi: bool = False,
-) -> None:
- current_platform.seed_everything(seed)
- torch.set_default_device(device)
- # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
- # As the xformers library is already tested with its own tests, we can use
- # a smaller MAX_SEQ_LEN here.
- max_len = min(MAX_SEQ_LEN, 4096)
- seq_lens = random.sample(range(1, max_len), num_seqs)
- num_tokens = sum(seq_lens)
-
- scale = float(1.0 / (head_size**0.5))
- num_query_heads, num_kv_heads = num_heads
- qkv = torch.empty(
- num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
- )
- qkv.uniform_(-scale, scale)
- query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
-
- num_queries_per_kv = num_query_heads // num_kv_heads
- if num_queries_per_kv > 1:
- # Handle MQA and GQA
- key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
- value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
- alibi_bias = None
- if use_alibi:
- alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
- attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
- output = torch.empty_like(query)
- start = 0
- # Dynamic sequence length not supported with custom attn_bias.
- for i, seq_len in enumerate(seq_lens):
- end = start + seq_len
- out = xops.memory_efficient_attention_forward(
- query[None, start:end],
- key[None, start:end],
- value[None, start:end],
- attn_bias=attn_bias[i],
- p=0.0,
- scale=scale,
- )
- output[start:end].copy_(out.view_as(query[start:end]))
- start += seq_len
- # xformers.AttentionBias to Tensor for use in reference impl.
- alibi_bias = [
- b.materialize((1, num_query_heads, i, i), device=device).squeeze()
- for b, i in zip(attn_bias, seq_lens)
- ]
- else:
- attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
- output = xops.memory_efficient_attention_forward(
- query.unsqueeze(0),
- key.unsqueeze(0),
- value.unsqueeze(0),
- attn_bias=attn_bias,
- p=0.0,
- scale=scale,
- )
- output = output.squeeze(0)
-
- cu_seq_lens = [0]
- for seq_len in seq_lens:
- cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
- ref_output = ref_multi_query_kv_attention(
- cu_seq_lens,
- query,
- key,
- value,
- scale,
- alibi_bias,
- dtype,
- )
- atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
- rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
- torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
-
-
-@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", [64])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(
- current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
-)
-@torch.inference_mode()
-def test_multi_query_kv_attention_with_alibi(
- num_seqs: int,
- num_heads: tuple[int, int],
- head_size: int,
- dtype: torch.dtype,
- seed: int,
- device: str,
-) -> None:
- return test_multi_query_kv_attention(
- num_seqs,
- num_heads,
- head_size,
- dtype,
- seed,
- device,
- use_alibi=True,
- )
-
-
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 9be56a33f76c8..cd34b520ea71b 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -34,7 +34,7 @@ DEVICE_MLA_BACKENDS = {
}
DEVICE_REGULAR_ATTN_BACKENDS = {
- "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
+ "cuda": ["FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_ATTN"],
"cpu": ["CPU_ATTN"],
}
@@ -207,12 +207,6 @@ def test_env(
)
expected = "FLASHINFER"
assert backend.get_name() == expected
- elif name == "XFORMERS":
- backend = get_attn_backend(
- 32, torch.float16, None, block_size, use_mla=use_mla
- )
- expected = "XFORMERS"
- assert backend.get_name() == expected
elif name == "FLASH_ATTN":
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index a878ac6396ce5..ae3c63cc62d6b 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -24,10 +24,6 @@ from vllm.platforms.rocm import RocmPlatform
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()
- # Clear xformers availability cache
- import vllm.attention.layer as layer_module
-
- layer_module.USE_XFORMERS_OPS = None
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 5d5a26fbfc2cd..9307ef7814a8b 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -509,43 +509,6 @@ def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs:
)
-def make_alibi_bias(
- alibi_slopes: torch.Tensor,
- num_kv_heads: int,
- dtype: torch.dtype,
- seq_lens: list[int],
-) -> list[Any]:
- """Create ALiBi biases compatible with xFormers attention tests."""
- from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias
-
- if alibi_slopes is None:
- return [None for _ in seq_lens]
-
- attn_biases: list[Any] = []
- num_heads = alibi_slopes.shape[0]
- assert num_heads >= num_kv_heads, (
- "ALiBi slopes expect at least as many heads as KV heads"
- )
-
- for seq_len in seq_lens:
- bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
- bias = bias[None, :] - bias[:, None]
-
- padded_len = (seq_len + 7) // 8 * 8
- bias_tensor = torch.empty(
- 1,
- num_heads,
- seq_len,
- padded_len,
- device=alibi_slopes.device,
- dtype=dtype,
- )[:, :, :, :seq_len].copy_(bias)
- bias_tensor.mul_(alibi_slopes[:, None, None])
- attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))
-
- return attn_biases
-
-
def _make_metadata_tensors(
seq_lens: list[int] | None,
context_lens: list[int] | None,
@@ -649,23 +612,12 @@ def make_kv_cache(
Returns:
- * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
- * for backend 'XFORMERS'
* kv_cache: 2 x num_blocks x block_size x num_heads x head_size
* for backend 'FLASH_ATTN'
"""
- if backend == "XFORMERS":
- kv_cache = torch.rand((2, num_blocks, block_size * num_heads * head_size)).to(
- device
- )
- elif backend == "FLASH_ATTN":
- kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to(
- device
- )
- else:
- raise ValueError(
- f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
- )
+ if backend != "FLASH_ATTN":
+ raise ValueError(f"Unknown backend value: '{backend}'. Expected 'FLASH_ATTN'.")
+ kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to(device)
if default_val is not None:
kv_cache[:, :, :] = default_val
return kv_cache
@@ -843,22 +795,14 @@ def assert_actual_matches_ideal(
* output_under_test: actually observed output value
"""
ideal_output = test_params.packed_qkvo.ideal_output
- if backend == "XFORMERS":
- torch.testing.assert_close(
- ideal_output, output_under_test.view_as(ideal_output)
- )
-
- elif backend == "FLASH_ATTN":
- # For FlashAttention override the accuracy thresholds to non default
- # values since we notice a higher difference between the ideal and
- # actual output.
- torch.testing.assert_close(
- ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016
- )
- else:
- raise ValueError(
- f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
- )
+ if backend != "FLASH_ATTN":
+ raise ValueError(f"Unknown backend value: '{backend}'. Expected 'FLASH_ATTN'.")
+ # For FlashAttention override the accuracy thresholds to non default
+ # values since we notice a higher difference between the ideal and
+ # actual output.
+ torch.testing.assert_close(
+ ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016
+ )
# Copied/modified from torch._refs.__init__.py
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 1cf8ed602b6a4..e430826461a14 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -57,10 +57,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
@@ -84,10 +80,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
@@ -108,10 +100,6 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 1800ca107a426..7d8c940100ca4 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -2,12 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
-import pytest
-
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
@@ -142,10 +139,6 @@ QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
def test_qwen2vl_lora(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
@@ -156,10 +149,6 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA through beam search."""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
@@ -178,10 +167,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
-)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index 6747cf7743b14..125e4e3827747 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -43,7 +43,6 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
- XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
ROCM_AITER_TRITON_MLA = (
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a8e796a1eab63..f1d57ac50fb9f 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -51,31 +51,6 @@ else:
FP8_DTYPE = current_platform.fp8_dtype()
logger = init_logger(__name__)
-USE_XFORMERS_OPS = None
-
-
-def check_xformers_availability():
- global USE_XFORMERS_OPS
- if USE_XFORMERS_OPS is not None:
- return USE_XFORMERS_OPS
-
- if current_platform.is_cuda() and current_platform.has_device_capability(100):
- # Xformers FA is not compatible with B200
- USE_XFORMERS_OPS = False
- else:
- try:
- from importlib.util import find_spec
-
- find_spec("xformers.ops")
- USE_XFORMERS_OPS = True
- except ImportError:
- USE_XFORMERS_OPS = False
-
- # the warning only needs to be shown once
- if not USE_XFORMERS_OPS:
- logger.warning("Xformers is not available, falling back.")
-
- return USE_XFORMERS_OPS
def check_upstream_fa_availability(dtype: torch.dtype):
@@ -533,7 +508,6 @@ class MultiHeadAttention(nn.Module):
if backend
in {
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.PALLAS,
AttentionBackendEnum.ROCM_AITER_FA,
AttentionBackendEnum.FLASH_ATTN,
@@ -549,12 +523,6 @@ class MultiHeadAttention(nn.Module):
)
)
- if (
- self.attn_backend == AttentionBackendEnum.XFORMERS
- and not check_xformers_availability()
- ):
- self.attn_backend = AttentionBackendEnum.TORCH_SDPA
-
self.is_flash_attn_backend = self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
@@ -614,12 +582,6 @@ class MultiHeadAttention(nn.Module):
max_seqlen_k=kv_len,
softmax_scale=self.scale,
)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
-
- out = xops.memory_efficient_attention_forward(
- query, key, value, scale=self.scale
- )
elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
query, key, value = (x.transpose(1, 2) for x in (query, key, value))
out = F.scaled_dot_product_attention(query, key, value, scale=self.scale)
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index 06a9f7cd82266..46f8f5117f7a7 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -3,7 +3,7 @@
"""
This file contains ops for ViT attention to be compatible with torch.compile
as there are operations here not supported by torch.compile (for instance,
-`to_list` in xformers attn, or `.item()` in flash attention)
+`.item()` in flash attention)
Using these ops and wrapping vision blocks with `torch.compile` can speed up
throughput in vision models by ~5% relative on H100, and improve token
@@ -19,42 +19,6 @@ import torch.nn.functional as F
from vllm.utils.torch_utils import direct_register_custom_op
-def xformers_attn_seqlens_wrapper(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device
- )
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
- return context_layer
-
-
-def xformers_attn_seqlens_wrapper_fake(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- b, s, h, d = q.shape
- return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
-
-
-direct_register_custom_op(
- op_name="xformers_attn_seqlens_wrapper",
- op_func=xformers_attn_seqlens_wrapper,
- fake_impl=xformers_attn_seqlens_wrapper_fake,
-)
-
-
-def vit_xformers_attn_wrapper(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens)
-
-
def flash_attn_maxseqlen_wrapper(
q: torch.Tensor,
k: torch.Tensor,
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index e9af08b2316d2..ad19b58aa155c 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -36,7 +36,14 @@ def get_env_variable_attn_backend() -> AttentionBackendEnum | None:
* None otherwise
"""
backend_name = os.environ.get(STR_BACKEND_ENV_VAR)
- return None if backend_name is None else AttentionBackendEnum[backend_name]
+ if backend_name is None:
+ return None
+ if backend_name == "XFORMERS":
+ raise ValueError(
+ "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
+ "details). Please select a supported attention backend."
+ )
+ return AttentionBackendEnum[backend_name]
# Global state allows a particular choice of backend
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 9f62b35ed515c..00a81a319bf72 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -173,6 +173,12 @@ class MultiModalConfig:
# We need to import the real type here (deferred to avoid circular import).
from vllm.attention.backends.registry import AttentionBackendEnum
+ if isinstance(value, str) and value.upper() == "XFORMERS":
+ raise ValueError(
+ "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
+ "details). Please select a supported attention backend."
+ )
+
if value is None or isinstance(value, AttentionBackendEnum):
return value
diff --git a/vllm/envs.py b/vllm/envs.py
index 9b1ed1fc680b4..56558548d3981 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -640,7 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Example options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
- # - "XFORMERS": use XFormers
# - "FLASHINFER": use flashinfer
# - "FLASHMLA": use FlashMLA
# - "FLASH_ATTN_MLA": use FlashAttention for MLA
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 2d2251e83b5b1..5460018d0d67a 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -306,7 +306,6 @@ class DotsVisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -324,7 +323,6 @@ class DotsVisionAttention(nn.Module):
rotary_pos_emb: torch.Tensor | None = None,
*,
max_seqlen: int | None = None,
- seqlens: list[int] | None = None,
) -> torch.Tensor:
# [S, C] -> [S, B=1, C]
x = hidden_states.unsqueeze(1)
@@ -374,16 +372,6 @@ class DotsVisionAttention(nn.Module):
out_i = out_i.permute(0, 2, 1, 3)
outputs.append(out_i)
context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0]
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
else:
raise RuntimeError("Unsupported attention backend")
@@ -545,14 +533,12 @@ class DotsVisionBlock(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None,
- seqlens: list[int] | None = None,
) -> torch.Tensor:
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
@@ -663,18 +649,14 @@ class DotsVisionTransformer(nn.Module):
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self, hidden_states: torch.Tensor, grid_thw: list[list[int]]
@@ -694,14 +676,13 @@ class DotsVisionTransformer(nn.Module):
)
cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
for blk in self.blocks:
hidden_states = blk(
hidden_states,
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if self.post_trunk_norm is not None:
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index daa5bf03ea4a9..07b34fbc8addb 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -214,7 +214,6 @@ class Ernie4_5_VisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -259,7 +258,6 @@ class Ernie4_5_VisionAttention(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -311,20 +309,6 @@ class Ernie4_5_VisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -404,14 +388,12 @@ class Ernie4_5_VisionBlock(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
@@ -562,18 +544,14 @@ class Ernie4_5_VisionTransformer(nn.Module):
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, num_pad=0
@@ -598,8 +576,8 @@ class Ernie4_5_VisionTransformer(nn.Module):
if hidden_states.ndim == 2:
hidden_states = hidden_states.unsqueeze(dim=1)
- # pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
for i, blk in enumerate(self.blocks):
hidden_states = blk(
@@ -607,7 +585,6 @@ class Ernie4_5_VisionTransformer(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
final_output = self.ln(hidden_states)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index d141e95498064..7e0370886884f 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -309,7 +309,6 @@ class Glm4vVisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -345,7 +344,6 @@ class Glm4vVisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -400,20 +398,6 @@ class Glm4vVisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -461,7 +445,6 @@ class Glm4vVisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x_attn = self.attn(
self.norm1(x),
@@ -469,7 +452,6 @@ class Glm4vVisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x_fused_norm, residual = self.norm2(x, residual=x_attn)
x = residual + self.mlp(x_fused_norm)
@@ -803,15 +785,14 @@ class Glm4vVisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+ ) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -836,8 +817,9 @@ class Glm4vVisionTransformer(nn.Module):
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
- # pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
+ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
x = self.embeddings(
x, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
)
@@ -851,7 +833,6 @@ class Glm4vVisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
# adapter
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 8fc3db296aa79..302260b952992 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -9,6 +9,7 @@ from typing import Annotated, Any, Literal, TypeAlias, TypeVar
import numpy as np
import torch
import torch.nn as nn
+import torch.nn.functional as F
from einops import rearrange
from transformers import PretrainedConfig
from transformers.activations import GELUActivation
@@ -424,7 +425,7 @@ class KeyeSiglipAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
- AttentionBackendEnum.XFORMERS,
+ AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -451,7 +452,6 @@ class KeyeSiglipAttention(nn.Module):
)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
batch_size = q.shape[0]
if rope_emb is None:
@@ -498,17 +498,21 @@ class KeyeSiglipAttention(nn.Module):
softmax_scale=self.scale,
)
context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
+ elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
+ outputs = []
+ for i in range(1, len(cu_seqlens)):
+ start_idx = cu_seqlens[i - 1]
+ end_idx = cu_seqlens[i]
+ q_i = q[:, start_idx:end_idx]
+ k_i = k[:, start_idx:end_idx]
+ v_i = v[:, start_idx:end_idx]
+ q_i, k_i, v_i = (
+ rearrange(x, "b s h d -> b h s d") for x in (q_i, k_i, v_i)
+ )
+ output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+ output_i = rearrange(output_i, "b h s d -> b s h d ")
+ outputs.append(output_i)
+ context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0]
context_layer = rearrange(context_layer, "b s h d -> b s (h d)").contiguous()
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index dee0c16ab0f63..74bb868492da9 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -38,7 +38,6 @@ from vllm.attention.layer import (
)
from vllm.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
- vit_xformers_attn_wrapper,
)
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -657,7 +656,6 @@ class SiglipAttention(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor | None,
max_seqlen: torch.Tensor | None,
- seqlens: torch.Tensor | None,
) -> torch.Tensor:
batch_size, _, _ = hidden_states.shape
@@ -703,10 +701,6 @@ class SiglipAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- if seqlens is None:
- raise ValueError("xFormers attention backend requires seqlens tensor.")
- context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
else:
raise RuntimeError(
f"PaddleOCR-VL does not support {self.attn_backend} backend now."
@@ -818,7 +812,6 @@ class SiglipEncoderLayer(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor | None,
max_seqlen: torch.Tensor | None,
- seqlens: torch.Tensor | None,
) -> torch.Tensor:
residual = hidden_states
@@ -828,7 +821,6 @@ class SiglipEncoderLayer(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = residual + hidden_states
@@ -870,7 +862,6 @@ class SiglipEncoder(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -943,14 +934,11 @@ class SiglipEncoder(nn.Module):
cu_seqlens = cu_seqlens.to(device=device)
max_seqlen = None
- seqlens = None
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
hidden_states = inputs_embeds
for encoder_layer in self.layers:
@@ -959,7 +947,6 @@ class SiglipEncoder(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
return hidden_states
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 8a034fd72b02a..6011d93a795d1 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -74,6 +74,7 @@ from .vision import (
)
try:
+ # Note: vLLM does not install xformers by default.
from xformers import ops as xops
if current_platform.is_cuda() and current_platform.has_device_capability(100):
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 1500a437613cc..8c707c2561af1 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -46,7 +46,6 @@ from vllm.attention.layer import maybe_get_vit_flash_attn_backend
from vllm.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
vit_torch_sdpa_wrapper,
- vit_xformers_attn_wrapper,
)
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
@@ -375,7 +374,6 @@ class Qwen2_5_VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -435,8 +433,6 @@ class Qwen2_5_VisionAttention(nn.Module):
v,
cu_seqlens,
)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
output, _ = self.proj(context_layer)
return output
@@ -448,9 +444,7 @@ class Qwen2_5_VisionAttention(nn.Module):
"cu_seqlens": 0,
"rotary_pos_emb_cos": 0,
"rotary_pos_emb_sin": 0,
- "seqlens": 0,
},
- mark_unbacked_dims={"seqlens": 0},
enable_if=should_torch_compile_mm_vit,
)
class Qwen2_5_VisionBlock(nn.Module):
@@ -501,7 +495,6 @@ class Qwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x_attn = self.attn(
self.norm1(x),
@@ -509,7 +502,6 @@ class Qwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x_fused_norm, residual = self.norm2(x, residual=x_attn)
x = residual + self.mlp(x_fused_norm)
@@ -670,7 +662,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -822,17 +813,14 @@ class Qwen2_5_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
@staticmethod
def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
@@ -897,10 +885,8 @@ class Qwen2_5_VisionTransformer(nn.Module):
# transformers
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
- max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(cu_seqlens)
- max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
- cu_window_seqlens
- )
+ max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True)
@@ -927,11 +913,9 @@ class Qwen2_5_VisionTransformer(nn.Module):
if layer_num in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
max_seqlen_now = max_seqlen_full
- seqlens_now = seqlens_full
else:
cu_seqlens_now = cu_window_seqlens
max_seqlen_now = max_seqlen_window
- seqlens_now = seqlens_window
hidden_states = blk(
hidden_states,
@@ -939,7 +923,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen_now,
- seqlens=seqlens_now,
)
# For Qwen2.5-VL-3B, float16 will overflow at last block
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 479a7871e364f..9d1d023aed172 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -348,7 +348,6 @@ class Qwen2VisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -384,7 +383,6 @@ class Qwen2VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, 3 * head * head_dim]
x, _ = self.qkv(x)
@@ -445,20 +443,6 @@ class Qwen2VisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -509,7 +493,6 @@ class Qwen2VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -517,7 +500,6 @@ class Qwen2VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -728,18 +710,14 @@ class Qwen2VisionTransformer(nn.Module):
sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -771,7 +749,7 @@ class Qwen2VisionTransformer(nn.Module):
x = x.unsqueeze(1)
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
for blk in self.blocks:
x = blk(
@@ -780,7 +758,6 @@ class Qwen2VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
# adapter
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 54ef56f83344e..61f218f16d79c 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -224,7 +224,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -232,7 +231,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -500,14 +498,11 @@ class Qwen3Omni_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -533,7 +528,7 @@ class Qwen3Omni_VisionTransformer(nn.Module):
hidden_states = hidden_states.unsqueeze(1)
rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device)
rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device)
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
hidden_states_list = []
deepstack_visual_indexes = self.deepstack_visual_indexes
@@ -545,7 +540,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if (
deepstack_visual_indexes is not None
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 90c4894d33e88..4cd6fa14c32df 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -235,7 +235,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -243,7 +242,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -391,7 +389,6 @@ class Qwen3_VisionTransformer(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -531,17 +528,14 @@ class Qwen3_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -569,7 +563,7 @@ class Qwen3_VisionTransformer(nn.Module):
cu_seqlens = torch.from_numpy(cu_seqlens)
hidden_states = hidden_states.unsqueeze(1)
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
deepstack_feature_lists = []
@@ -580,7 +574,6 @@ class Qwen3_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if layer_num in self.deepstack_visual_indexes:
deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index f9bf242b7194e..06793a3d1bb14 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -277,12 +277,7 @@ class CudaPlatformBase(Platform):
except ImportError:
pass
- if cls.has_device_capability(100):
- # xFormers doesn't support Blackwell, fall back to SDPA
- # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
- return AttentionBackendEnum.TORCH_SDPA
- else:
- return AttentionBackendEnum.XFORMERS
+ return AttentionBackendEnum.TORCH_SDPA
@classmethod
def get_valid_backends(
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 3ef44e7703204..d94da71b289f3 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -49,7 +49,6 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
# Possible string values of STR_BACKEND_ENV_VAR
# register, corresponding to possible backends
STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
-STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
deleted file mode 100644
index 5039c44b9c3e6..0000000000000
--- a/vllm/v1/attention/backends/xformers.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer with XFormersAttention."""
-
-from dataclasses import dataclass
-from typing import ClassVar, Optional
-
-import torch
-
-from vllm.attention.backends.abstract import (
- AttentionBackend,
- AttentionImpl,
- AttentionType,
- MultipleOf,
-)
-from vllm.attention.ops.triton_unified_attention import unified_attention
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.v1.attention.backends.utils import (
- AttentionMetadataBuilder,
- CommonAttentionMetadata,
- split_decodes_and_prefills,
-)
-from vllm.v1.kv_cache_interface import AttentionSpec
-
-try:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import (
- AttentionBias,
- PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
- )
-
- XFORMERS_AVAILABLE = True
-except ImportError:
- XFORMERS_AVAILABLE = False
-
-from vllm import _custom_ops as ops
-
-logger = init_logger(__name__)
-
-
-class XFormersAttentionBackend(AttentionBackend):
- accept_output_buffer: bool = True
- supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-
- @staticmethod
- def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
- return [MultipleOf(16)]
-
- @classmethod
- def get_supported_head_sizes(cls) -> list[int]:
- return [
- 32,
- 40,
- 48,
- 56,
- 64,
- 72,
- 80,
- 88,
- 96,
- 104,
- 112,
- 120,
- 128,
- 136,
- 144,
- 152,
- 160,
- 168,
- 176,
- 184,
- 192,
- 200,
- 208,
- 216,
- 224,
- 232,
- 240,
- 248,
- 256,
- ]
-
- @staticmethod
- def get_name() -> str:
- return "XFORMERS"
-
- @staticmethod
- def get_impl_cls() -> type["XFormersAttentionImpl"]:
- return XFormersAttentionImpl
-
- @staticmethod
- def get_kv_cache_shape(
- num_blocks: int,
- block_size: int,
- num_kv_heads: int,
- head_size: int,
- cache_dtype_str: str = "auto",
- ) -> tuple[int, ...]:
- if block_size % 16 != 0:
- raise ValueError("Block size must be a multiple of 16.")
- return (2, num_blocks, block_size, num_kv_heads, head_size)
-
- @staticmethod
- def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]:
- return XFormersAttentionMetadataBuilder
-
- @staticmethod
- def use_cascade_attention(*args, **kwargs) -> bool:
- return False
-
-
-@dataclass
-class XFormersAttentionMetadata:
- num_actual_tokens: int # Number of tokens excluding padding.
- max_query_len: int
- query_start_loc: torch.Tensor
- max_seq_len: int
- seq_lens: torch.Tensor
- block_table: torch.Tensor
- slot_mapping: torch.Tensor
-
- num_prefill_tokens: int = 0
- num_decode_tokens: int = 0
- num_prefills: int = 0
- num_decodes: int = 0
-
- # Biases for different attention types.
- attn_bias: Optional["AttentionBias"] = None
-
- # Self-attention prefill/decode metadata cache
- _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None
- _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None
-
- @property
- def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]:
- if self.num_prefills == 0:
- return None
-
- if self._cached_prefill_metadata is not None:
- # Recover cached prefill-phase attention
- # metadata structure
- return self._cached_prefill_metadata
-
- q_start_loc = self.query_start_loc[self.num_decodes :]
- q_seqlens = torch.diff(q_start_loc)
- kv_seqlens = self.seq_lens[self.num_decodes :]
- # Construct & cache prefill-phase attention metadata structure
- self._cached_prefill_metadata = XFormersAttentionMetadata(
- num_actual_tokens=self.num_prefill_tokens,
- max_query_len=int(q_seqlens.max().item()),
- query_start_loc=q_start_loc - q_start_loc[0],
- max_seq_len=int(kv_seqlens.max().item()),
- seq_lens=kv_seqlens,
- block_table=self.block_table[self.num_decodes :],
- slot_mapping=self.slot_mapping[self.num_decode_tokens :],
- )
- return self._cached_prefill_metadata
-
- @property
- def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]:
- if self.num_decode_tokens == 0:
- return None
-
- if self._cached_decode_metadata is not None:
- # Recover cached decode-phase attention
- # metadata structure
- return self._cached_decode_metadata
-
- q_start_loc = self.query_start_loc
- q_seqlens = torch.diff(q_start_loc)
- decode_kv_seqlens = self.seq_lens[: self.num_decodes]
- # Construct & cache decode-phase attention metadata structure
- self._cached_decode_metadata = XFormersAttentionMetadata(
- num_actual_tokens=self.num_decode_tokens,
- max_query_len=int(q_seqlens[: self.num_decodes].max().item()),
- query_start_loc=q_start_loc[: self.num_decodes + 1],
- max_seq_len=int(decode_kv_seqlens.max().item()),
- seq_lens=decode_kv_seqlens,
- block_table=self.block_table[: self.num_decodes],
- slot_mapping=self.slot_mapping[: self.num_decode_tokens],
- attn_bias=self.attn_bias,
- )
- return self._cached_decode_metadata
-
-
-class XFormersAttentionMetadataBuilder(
- AttentionMetadataBuilder[XFormersAttentionMetadata]
-):
- reorder_batch_threshold: int = 1
-
- def __init__(
- self,
- kv_cache_spec: AttentionSpec,
- layer_names: list[str],
- vllm_config: VllmConfig,
- device: torch.device,
- ):
- super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-
- assert XFORMERS_AVAILABLE
- self.block_size = kv_cache_spec.block_size
- self._num_decodes = 0
- self._num_decode_tokens = 0
-
- def build(
- self,
- common_prefix_len: int,
- common_attn_metadata: CommonAttentionMetadata,
- fast_build: bool = False,
- ) -> XFormersAttentionMetadata:
- num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
- split_decodes_and_prefills(
- common_attn_metadata, decode_threshold=self.reorder_batch_threshold
- )
- )
-
- num_actual_tokens = common_attn_metadata.num_actual_tokens
- q_start_loc = common_attn_metadata.query_start_loc
- q_seqlens = torch.diff(q_start_loc)
- max_query_len = common_attn_metadata.max_query_len
- kv_seqlens = common_attn_metadata.seq_lens
- max_seq_len = common_attn_metadata.max_seq_len
- block_table = common_attn_metadata.block_table_tensor
- slot_mapping = common_attn_metadata.slot_mapping
-
- bias = None
- if num_decodes > 0:
- # Construct the decoder bias.
- decode_q_seqlens = q_seqlens[:num_decodes]
- decode_kv_seqlens = kv_seqlens[:num_decodes]
- bias = PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
- q_seqlen=decode_q_seqlens.tolist(),
- kv_seqlen=decode_kv_seqlens.tolist(),
- page_size=self.block_size,
- block_tables=block_table[:num_decodes],
- device=block_table.device,
- )
-
- return XFormersAttentionMetadata(
- num_actual_tokens=num_actual_tokens,
- num_prefill_tokens=num_prefill_tokens,
- num_decode_tokens=num_decode_tokens,
- num_prefills=num_prefills,
- num_decodes=num_decodes,
- max_query_len=max_query_len,
- query_start_loc=q_start_loc,
- max_seq_len=max_seq_len,
- seq_lens=kv_seqlens,
- block_table=block_table,
- slot_mapping=slot_mapping,
- attn_bias=bias,
- )
-
-
-class XFormersAttentionImpl(AttentionImpl):
- def __init__(
- self,
- num_heads: int,
- head_size: int,
- scale: float,
- num_kv_heads: int,
- alibi_slopes: list[float] | None,
- sliding_window: int | None,
- kv_cache_dtype: str,
- logits_soft_cap: float | None = None,
- attn_type: AttentionType = AttentionType.DECODER,
- kv_sharing_target_layer_name: str | None = None,
- ) -> None:
- if kv_sharing_target_layer_name is not None:
- raise NotImplementedError("KV sharing is not supported in V0.")
- if alibi_slopes is not None:
- raise NotImplementedError("XFormers does not support alibi slopes yet.")
- self.num_heads = num_heads
- self.head_size = head_size
- self.scale = float(scale)
- self.num_kv_heads = num_kv_heads
- self.num_queries_per_kv = self.num_heads // self.num_kv_heads
- self.kv_cache_dtype = kv_cache_dtype
- self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
- if alibi_slopes is not None:
- alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
- self.alibi_slopes = alibi_slopes
- if sliding_window is None:
- self.sliding_window = (-1, -1)
- else:
- self.sliding_window = (sliding_window - 1, 0)
- if logits_soft_cap is None:
- # Setting logits_soft_cap to 0 means no soft cap.
- logits_soft_cap = 0
- self.logits_soft_cap = logits_soft_cap
-
- if attn_type != AttentionType.DECODER:
- raise NotImplementedError(
- "Encoder self-attention and "
- "encoder/decoder cross-attention "
- "are not implemented for "
- "XFormersAttentionImpl."
- )
-
- def forward(
- self,
- layer: torch.nn.Module,
- query: torch.Tensor,
- key: torch.Tensor,
- value: torch.Tensor,
- kv_cache: torch.Tensor,
- attn_metadata: XFormersAttentionMetadata,
- output: torch.Tensor | None = None,
- output_scale: torch.Tensor | None = None,
- output_block_scale: torch.Tensor | None = None,
- ) -> torch.Tensor:
- """Forward pass with XFormers.
-
- Args:
- query: shape = [num_tokens, num_heads, head_size]
- key: shape = [num_tokens, num_kv_heads, head_size]
- value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache: shape =
- [2, num_blocks, block_size, num_kv_heads, head_size]
- attn_metadata: Metadata for attention.
- Returns:
- shape = [num_tokens, num_heads * head_size]
- """
- assert output is not None, "Output tensor must be provided."
-
- if output_scale is not None or output_block_scale is not None:
- raise NotImplementedError(
- "fused output quantization is not yet supported"
- " for XFormersAttentionImpl"
- )
-
- if attn_metadata is None:
- # Profiling run.
- return output.fill_(0)
-
- # Cache the input KVs.
- key_cache, value_cache = kv_cache.unbind(0)
- if self.kv_sharing_target_layer_name is None:
- # Reshape the input keys and values and store them in the cache.
- # Skip this if sharing KV cache with an earlier attention layer.
- # NOTE(woosuk): Here, key and value are padded while slot_mapping is
- # not padded. However, we don't need to do key[:num_actual_tokens]
- # and value[:num_actual_tokens] because the reshape_and_cache_flash
- # op uses the slot_mapping's shape to determine the number of
- # actual tokens.
- ops.reshape_and_cache_flash(
- key,
- value,
- key_cache,
- value_cache,
- attn_metadata.slot_mapping,
- self.kv_cache_dtype,
- layer._k_scale,
- layer._v_scale,
- )
-
- num_actual_tokens = attn_metadata.num_actual_tokens
- num_decode_tokens = attn_metadata.num_decode_tokens
- if prefill_meta := attn_metadata.prefill_metadata:
- descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, key.shape[1])
- unified_attention(
- q=query[num_decode_tokens:num_actual_tokens],
- k=key_cache,
- v=value_cache,
- out=output[num_decode_tokens:num_actual_tokens],
- cu_seqlens_q=prefill_meta.query_start_loc,
- max_seqlen_q=prefill_meta.max_query_len,
- seqused_k=prefill_meta.seq_lens,
- max_seqlen_k=prefill_meta.max_seq_len,
- softmax_scale=self.scale,
- causal=True,
- alibi_slopes=self.alibi_slopes,
- window_size=self.sliding_window,
- block_table=prefill_meta.block_table,
- softcap=self.logits_soft_cap,
- q_descale=None, # Not supported
- k_descale=layer._k_scale.expand(descale_shape),
- v_descale=layer._v_scale.expand(descale_shape),
- )
-
- if decode_meta := attn_metadata.decode_metadata:
- # Query for decode. KV is not needed because it is already cached.
- decode_query = query[:num_decode_tokens]
- # Reshape query to [1, B_T, G, H, D].
- q = decode_query.view(
- 1, -1, self.num_kv_heads, self.num_queries_per_kv, self.head_size
- )
- # Reshape the k and v caches to [1, Bkv_T, G, H, D]
- cache_k = key_cache.view(
- 1, -1, self.num_kv_heads, 1, self.head_size
- ).expand(
- 1,
- -1,
- self.num_kv_heads,
- self.num_queries_per_kv,
- self.head_size,
- )
- cache_v = value_cache.view(
- 1, -1, self.num_kv_heads, 1, self.head_size
- ).expand(
- 1,
- -1,
- self.num_kv_heads,
- self.num_queries_per_kv,
- self.head_size,
- )
-
- attn_bias = decode_meta.attn_bias
- output[:num_decode_tokens] = xops.memory_efficient_attention_forward(
- q,
- cache_k,
- cache_v,
- attn_bias=attn_bias,
- p=0.0,
- scale=self.scale,
- ).view(decode_query.shape)
-
- # Reshape the output tensor.
- return output
From ed40d85929f25a939b8d79ef3f2fb923f5a7bb54 Mon Sep 17 00:00:00 2001
From: Fanli Lin
Date: Mon, 24 Nov 2025 14:48:45 +0800
Subject: [PATCH 002/134] [BugFix] Fix R-VL model loading error (#29299)
Signed-off-by: Lin, Fanli
---
examples/offline_inference/vision_language_multi_image.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index d6e169548f15b..301265d4e17f7 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1110,6 +1110,7 @@ def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=16384,
max_num_seqs=16,
+ trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
From 68dfe28eaefc20ce8d847c3b3ccf712716d20c20 Mon Sep 17 00:00:00 2001
From: "rongfu.leng"
Date: Mon, 24 Nov 2025 18:02:28 +0800
Subject: [PATCH 003/134] [Feature][Benchmark] add --link-vars to filter when
 serve_param equals bench_param (#28909)
Signed-off-by: rongfu.leng
---
vllm/benchmarks/sweep/serve.py | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 45ac446a7aedf..1298e4acbd87d 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -211,6 +211,7 @@ def run_combs(
output_dir: Path,
num_runs: int,
dry_run: bool,
+ links: list[tuple[str, str]],
):
all_data = list[dict[str, object]]()
for serve_comb in serve_params:
@@ -226,6 +227,14 @@ def run_combs(
else contextlib.nullcontext()
) as server:
for bench_comb in bench_params:
+ should_run = all(
+ serve_key in serve_comb
+ and bench_key in bench_comb
+ and serve_comb[serve_key] == bench_comb[bench_key]
+ for serve_key, bench_key in links
+ )
+ if not should_run:
+ continue
base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
comb_data = run_comb(
@@ -262,6 +271,7 @@ class SweepServeArgs:
num_runs: int
dry_run: bool
resume: str | None
+ link_vars: list[tuple[str, str]] | None
parser_name: ClassVar[str] = "serve"
parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings."
@@ -285,7 +295,7 @@ class SweepServeArgs:
else:
# i.e.: run bench_cmd without any modification
bench_params = ParameterSweep.from_records([{}])
-
+ link_vars = cls.parse_link_vars(args.link_vars)
num_runs = args.num_runs
if num_runs < 1:
raise ValueError("`num_runs` should be at least 1.")
@@ -301,6 +311,7 @@ class SweepServeArgs:
num_runs=num_runs,
dry_run=args.dry_run,
resume=args.resume,
+ link_vars=link_vars,
)
@classmethod
@@ -376,8 +387,28 @@ class SweepServeArgs:
"parameter combinations for which there are still no output files.",
)
+ parser.add_argument(
+ "--link-vars",
+ type=str,
+ default="",
+ help=(
+ "Comma-separated list of linked variables between serve and bench, "
+ "e.g. max_num_seqs=max_concurrency,max_model_len=random_input_len"
+ ),
+ )
+
return parser
+ @staticmethod
+ def parse_link_vars(s: str) -> list[tuple[str, str]]:
+ if not s:
+ return []
+ pairs = []
+ for item in s.split(","):
+ a, b = item.split("=")
+ pairs.append((a.strip(), b.strip()))
+ return pairs
+
def run_main(args: SweepServeArgs):
timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -397,6 +428,7 @@ def run_main(args: SweepServeArgs):
output_dir=output_dir,
num_runs=args.num_runs,
dry_run=args.dry_run,
+ links=args.link_vars,
)
except BaseException as exc:
raise RuntimeError(
From 8005e606bf280b7b6002f57e95ae3210ddc6f041 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?=
<38908462+zhyajie@users.noreply.github.com>
Date: Mon, 24 Nov 2025 18:16:52 +0800
Subject: [PATCH 004/134] [Bugfix][Rocm] Fix shared expert weight loading
failure in DeepSeek-MTP (#27563)
Signed-off-by: zhyajie
Co-authored-by: zhyajie
---
vllm/model_executor/models/deepseek_mtp.py | 153 +++++++++++++++------
vllm/model_executor/models/deepseek_v2.py | 14 +-
2 files changed, 121 insertions(+), 46 deletions(-)
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index e028dc497aa6a..6e23037b919ab 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,15 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
+import typing
+from collections.abc import Callable, Iterable
import torch
import torch.nn as nn
from transformers import PretrainedConfig
+from vllm._aiter_ops import rocm_aiter_ops
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -231,6 +233,9 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
return self.model.compute_logits(hidden_states, spec_step_idx)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ rocm_aiter_moe_shared_expert_enabled = (
+ rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+ )
stacked_params_mapping = [
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
@@ -238,11 +243,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
]
- expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
- num_experts=self.config.n_routed_experts,
+ num_experts=self.config.n_routed_experts
+ + (
+ self.config.n_shared_experts
+ if rocm_aiter_moe_shared_expert_enabled
+ else 0
+ ),
)
params_dict = dict(self.named_parameters())
@@ -253,6 +263,9 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if spec_layer is None:
continue
+ is_fusion_moe_shared_experts_layer = (
+ rocm_aiter_moe_shared_expert_enabled and ("mlp.shared_experts" in name)
+ )
name = self._rewrite_spec_layer_name(spec_layer, name)
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
@@ -266,6 +279,8 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
# for mlp.experts[0].gate_gate_up_proj, which breaks load.
if ("mlp.experts." in name) and name not in params_dict:
continue
+ if is_fusion_moe_shared_experts_layer:
+ continue
name_mapped = name.replace(weight_name, param_name)
# QKV fusion is optional, fall back to normal
@@ -286,45 +301,105 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
weight_loader(param, loaded_weight, shard_id)
break
else:
- for mapping in expert_params_mapping:
- param_name, weight_name, expert_id, shard_id = mapping
- if weight_name not in name:
- continue
- name = name.replace(weight_name, param_name)
-
- param = params_dict[name]
- weight_loader = param.weight_loader
- weight_loader(
- param,
- loaded_weight,
- name,
- shard_id=shard_id,
- expert_id=expert_id,
+ # Special handling: when AITER fusion_shared_experts is enabled,
+ # checkpoints may provide a single widened shared_experts tensor
+ # without explicit expert indices
+ # (e.g. ...mlp.shared_experts.gate_proj.weight).
+ # For models with multiple shared experts, split that tensor
+ # evenly into per-shared-expert slices and load them into
+ # appended expert slots mlp.experts.{n_routed_experts + j}.*
+ # accordingly.
+ num_chunks = 1
+ if is_fusion_moe_shared_experts_layer:
+ num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
+ # Determine split axis based on op type
+ # gate/up: ColumnParallel → split along dim 0
+ # down: RowParallel → split along dim 1
+ split_dim = 1 if "down_proj.weight" in name else 0
+ total = loaded_weight.shape[split_dim]
+ assert total % num_chunks == 0, (
+ f"Shared expert weight dim {total} "
+ f"not divisible by num_chunks {num_chunks}"
)
- break
- else:
- # Skip loading extra bias for GPTQ models.
- if name.endswith(".bias") and name not in params_dict:
- continue
+ chunk_size = total // num_chunks
- name = maybe_remap_kv_scale_name(name, params_dict)
- if name is None:
- continue
+ for j in range(num_chunks):
+ chunk_name = name
+ weight_to_load = loaded_weight
- # According to DeepSeek-V3 Technical Report, MTP modules
- # shares embedding layer. We only load the first weights.
- if (
- spec_layer != self.model.mtp_start_layer_idx
- and ".layers" not in name
- ):
- continue
+ if is_fusion_moe_shared_experts_layer:
+ if split_dim == 0:
+ weight_to_load = loaded_weight[
+ j * chunk_size : (j + 1) * chunk_size, :
+ ]
+ else:
+ weight_to_load = loaded_weight[
+ :, j * chunk_size : (j + 1) * chunk_size
+ ]
+ # Synthesize an expert-style name so expert mapping
+ # can route it
+ chunk_name = name.replace(
+ "mlp.shared_experts",
+ f"mlp.experts.{self.config.n_routed_experts + j}",
+ )
- param = params_dict[name]
- weight_loader = getattr(
- param, "weight_loader", default_weight_loader
- )
- weight_loader(param, loaded_weight)
- loaded_params.add(name)
+ # Use expert_params_mapping to locate the destination
+ # param and delegate to its expert-aware weight_loader
+ # with expert_id.
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in chunk_name:
+ continue
+
+ # Do not modify `name` since the loop may continue here
+ # Instead, create a new variable
+ name_mapped = chunk_name.replace(weight_name, param_name)
+
+ param = params_dict[name_mapped]
+ # We should ask the weight loader to return success or
+ # not here since otherwise we may skip experts with
+ # other available replicas.
+ weight_loader = typing.cast(
+ Callable[..., bool], param.weight_loader
+ )
+ success = weight_loader(
+ param,
+ weight_to_load,
+ name_mapped,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ return_success=True,
+ )
+ if success:
+ if not is_fusion_moe_shared_experts_layer:
+ name = name_mapped
+ else:
+ loaded_params.add(name_mapped)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ name = maybe_remap_kv_scale_name(name, params_dict)
+ if name is None:
+ continue
+
+ # According to DeepSeek-V3 Technical Report, MTP modules
+ # shares embedding layer. We only load the first weights.
+ if (
+ spec_layer != self.model.mtp_start_layer_idx
+ and ".layers" not in name
+ ):
+ continue
+
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ weight_loader(param, loaded_weight)
+ if not is_fusion_moe_shared_experts_layer:
+ loaded_params.add(name)
return loaded_params
def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 7cfd381592b49..ad932559b983d 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1479,8 +1479,8 @@ class DeepseekV2ForCausalLM(
if spec_layer is not None:
continue # skip spec decode layers for main model
- is_fuse_shared_experts_layer = rocm_aiter_moe_shared_expert_enabled and (
- "mlp.shared_experts" in name
+ is_fusion_moe_shared_experts_layer = (
+ rocm_aiter_moe_shared_expert_enabled and ("mlp.shared_experts" in name)
)
for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -1495,7 +1495,7 @@ class DeepseekV2ForCausalLM(
# for mlp.experts[0].gate_gate_up_proj, which breaks load.
if ("mlp.experts." in name) and name not in params_dict:
continue
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
continue
name_mapped = name.replace(weight_name, param_name)
@@ -1531,7 +1531,7 @@ class DeepseekV2ForCausalLM(
# appended expert slots mlp.experts.{n_routed_experts + j}.*
# accordingly.
num_chunks = 1
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
# Determine split axis based on op type
# gate/up: ColumnParallel → split along dim 0
@@ -1548,7 +1548,7 @@ class DeepseekV2ForCausalLM(
chunk_name = name
weight_to_load = loaded_weight
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
if split_dim == 0:
weight_to_load = loaded_weight[
j * chunk_size : (j + 1) * chunk_size, :
@@ -1599,7 +1599,7 @@ class DeepseekV2ForCausalLM(
return_success=True,
)
if success:
- if not is_fuse_shared_experts_layer:
+ if not is_fusion_moe_shared_experts_layer:
name = name_mapped
else:
loaded_params.add(name_mapped)
@@ -1628,7 +1628,7 @@ class DeepseekV2ForCausalLM(
param, "weight_loader", default_weight_loader
)
weight_loader(param, loaded_weight)
- if not is_fuse_shared_experts_layer:
+ if not is_fusion_moe_shared_experts_layer:
loaded_params.add(name)
return loaded_params
From eca7a8fb59c223c603922be0bd62f5c460972a50 Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Mon, 24 Nov 2025 12:10:48 +0100
Subject: [PATCH 005/134] [Doc]: fix typos in various files (#29230)
Signed-off-by: Didier Durand
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
benchmarks/kernels/deepgemm/README.md | 2 +-
vllm/config/vllm.py | 2 +-
vllm/forward_context.py | 2 +-
vllm/v1/worker/gpu_input_batch.py | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md
index 41e68e047be82..a28c6956be0e9 100644
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index d64e315b4fe39..8a3599416bc72 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -96,7 +96,7 @@ class VllmConfig:
"""`torch.compile` and cudagraph capture configuration for the model.
As a shorthand, one can append compilation arguments via
- -0.parameter=arguement such as `-O.mode=3` (same as `-O='{"mode":3}'`).
+ -0.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
You can specify the full compilation config like so:
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 25fb7181a8f29..7cb490e391abb 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -153,7 +153,7 @@ class DPMetadata:
@contextmanager
def sp_local_sizes(self, sequence_parallel_size: int):
"""
- Context mamager for setting self.local_sizes. Same as self.chunked_sizes
+ Context manager for setting self.local_sizes. Same as self.chunked_sizes
but without any chunking.
"""
self.local_sizes = _compute_sp_num_tokens(
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index d6fef450c028a..4a2818ab1bfd8 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -525,7 +525,7 @@ class InputBatch:
# NOTE: the following is unsafe
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
- # instead, we need to temporiarily copy the data for one of the indices
+ # instead, we need to temporarily copy the data for one of the indices
# TODO(lucas): optimize this by only copying valid indices
tmp = self.token_ids_cpu[i1, ...].copy()
self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
From 4de87866a8fdc9395ccd40e00c0f9075439b07ae Mon Sep 17 00:00:00 2001
From: R3hankhan
Date: Mon, 24 Nov 2025 17:38:09 +0530
Subject: [PATCH 006/134] [CPU][IBM Z] Fix BF16 support and vectorize math
operations for s390x (#28926)
Signed-off-by: Rehan Khan
---
csrc/cpu/cpu_attn_impl.hpp | 2 +-
csrc/cpu/cpu_types_vxe.hpp | 586 +++++++++++++++++++++++++++++++++----
2 files changed, 531 insertions(+), 57 deletions(-)
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index 12c6f5d3015cc..98f55d7c014be 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -847,7 +847,7 @@ struct VecTypeTrait {
};
#endif
-#if !defined(__powerpc__)
+#if !defined(__powerpc__) && !defined(__s390x__)
template <>
struct VecTypeTrait {
using vec_t = vec_op::FP16Vec16;
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index 51bca37e699b9..9efd8b7ec14a4 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -4,6 +4,7 @@
#include
#include
+#include
#include
namespace vec_op {
@@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec {
}
explicit FP32Vec8(const BF16Vec8& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
}
float reduce_sum() const {
@@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec {
}
FP32Vec8 exp() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::exp(ar.values[0]);
- ret.val[0][1] = std::exp(ar.values[1]);
- ret.val[0][2] = std::exp(ar.values[2]);
- ret.val[0][3] = std::exp(ar.values[3]);
- ret.val[1][0] = std::exp(ar.values[4]);
- ret.val[1][1] = std::exp(ar.values[5]);
- ret.val[1][2] = std::exp(ar.values[6]);
- ret.val[1][3] = std::exp(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ f32x4x2_t out;
+
+ const __vector float log2e = vec_splats(1.44269504088896341f);
+ const __vector float one = vec_splats(1.0f);
+ const __vector float min_x = vec_splats(-87.3f);
+ const __vector float max_x = vec_splats(88.7f);
+
+ // 5th-degree minimax polynomial for 2^r (r in [0,1))
+ const __vector float c1 = vec_splats(0.6931471805599453f);
+ const __vector float c2 = vec_splats(0.240226506959101f);
+ const __vector float c3 = vec_splats(0.05550410866482158f);
+ const __vector float c4 = vec_splats(0.009618129107628477f);
+ const __vector float c5 = vec_splats(0.0013333558146428443f);
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+
+ x = vec_max(x, min_x);
+ x = vec_min(x, max_x);
+
+ __vector float y = vec_mul(x, log2e);
+
+ __vector float kf = vec_floor(y);
+ __vector float r = vec_sub(y, kf);
+
+ __vector signed int k = vec_signed(kf);
+ const __vector signed int min_k = vec_splats((signed int)-126);
+ const __vector signed int max_k = vec_splats((signed int)127);
+ k = vec_min(vec_max(k, min_k), max_k);
+
+ // Build 2^k from exponent bits
+ __vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
+ __vector unsigned int bits = (__vector unsigned int)exp_int;
+ bits = vec_sl(bits, vec_splats((unsigned int)23));
+ __vector float pow2k = (__vector float)bits;
+
+ // Improved minimax polynomial
+ __vector float poly = vec_madd(c5, r, c4);
+ poly = vec_madd(poly, r, c3);
+ poly = vec_madd(poly, r, c2);
+ poly = vec_madd(poly, r, c1);
+ poly = vec_madd(poly, r, one);
+
+ out.val[i] = vec_mul(pow2k, poly);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 tanh() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::tanh(ar.values[0]);
- ret.val[0][1] = std::tanh(ar.values[1]);
- ret.val[0][2] = std::tanh(ar.values[2]);
- ret.val[0][3] = std::tanh(ar.values[3]);
- ret.val[1][0] = std::tanh(ar.values[4]);
- ret.val[1][1] = std::tanh(ar.values[5]);
- ret.val[1][2] = std::tanh(ar.values[6]);
- ret.val[1][3] = std::tanh(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+ const __vector float one = vec_splats(1.0f);
+ const __vector float two = vec_splats(2.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float sat =
+ vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
+
+ f32x4x2_t out;
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+ __vector float ax = vec_abs(x);
+
+ // sign(x): +1 or -1
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // saturation mask: |x| > sat
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // 2x
+ __vector float two_x = vec_mul(x, two);
+
+ // Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
+ f32x4x2_t tmp;
+ tmp.val[0] = two_x;
+ tmp.val[1] = two_x;
+ FP32Vec8 exp_2x_vec(tmp);
+
+ FP32Vec8 e2x = exp_2x_vec.exp();
+ __vector float e = e2x.reg.val[i];
+
+ // tanh(x) = (e - 1) / (e + 1)
+ __vector float num = vec_sub(e, one);
+ __vector float den = vec_add(e, one);
+
+ __vector float t = vec_div(num, den);
+
+ // For large |x|, clamp to sign(x)
+ out.val[i] = vec_sel(t, sign, saturated);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 er() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::erf(ar.values[0]);
- ret.val[0][1] = std::erf(ar.values[1]);
- ret.val[0][2] = std::erf(ar.values[2]);
- ret.val[0][3] = std::erf(ar.values[3]);
- ret.val[1][0] = std::erf(ar.values[4]);
- ret.val[1][1] = std::erf(ar.values[5]);
- ret.val[1][2] = std::erf(ar.values[6]);
- ret.val[1][3] = std::erf(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // A&S 7.1.26 approximation:
+ // erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
+ // exp(-x^2)) t = 1 / (1 + p*|x|), p = 0.3275911
+
+ const __vector float one = vec_splats(1.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float p = vec_splats(0.3275911f);
+
+ // Polynomial coeffs
+ const __vector float a1 = vec_splats(0.254829592f);
+ const __vector float a2 = vec_splats(-0.284496736f);
+ const __vector float a3 = vec_splats(1.421413741f);
+ const __vector float a4 = vec_splats(-1.453152027f);
+ const __vector float a5 = vec_splats(1.061405429f);
+
+ // Threshold where erf(x) ~ sign(x)
+ const __vector float sat = vec_splats(6.0f);
+
+ f32x4x2_t out;
+
+ for (int lane = 0; lane < 2; lane++) {
+ __vector float x = reg.val[lane];
+ __vector float ax = vec_abs(x);
+
+ // sign(x)
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // |x| > 6 → erf(x) = ±1
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // t = 1 / (1 + p * |x|)
+ __vector float t = vec_madd(p, ax, one);
+ t = vec_div(one, t);
+
+ // poly = a5
+ __vector float poly = a5;
+ poly = vec_madd(poly, t, a4);
+ poly = vec_madd(poly, t, a3);
+ poly = vec_madd(poly, t, a2);
+ poly = vec_madd(poly, t, a1);
+
+ // full polynomial: poly = poly * t
+ poly = vec_mul(poly, t);
+
+ // Compute exp(-x^2)
+ __vector float x2 = vec_mul(x, x);
+ __vector float neg_x2 = vec_neg(x2);
+
+ f32x4x2_t tmp;
+ tmp.val[0] = neg_x2;
+ tmp.val[1] = neg_x2;
+ FP32Vec8 exp_neg_x2(tmp);
+
+ FP32Vec8 e = exp_neg_x2.exp();
+ __vector float ex = e.reg.val[lane];
+
+ // erf(x) = sign * (1 - poly * exp(-x^2))
+ __vector float term = vec_mul(poly, ex);
+ __vector float y = vec_sub(one, term);
+ y = vec_mul(y, sign);
+
+ // saturated → ±1
+ __vector float sat_val = vec_mul(sign, one);
+ out.val[lane] = vec_sel(y, sat_val, saturated);
+ }
+
+ return FP32Vec8(out);
+ }
+ // Elementwise sigmoid(x) = 1 / (1 + exp(-x))
+ FP32Vec8 sigmoid() const {
+ const __vector float one = vec_splats(1.0f);
+
+ f32x4x2_t neg;
+ for (int i = 0; i < 2; ++i) {
+ neg.val[i] = vec_neg(reg.val[i]);
+ }
+
+ FP32Vec8 neg_x(neg);
+ FP32Vec8 e = neg_x.exp(); // exp(-x)
+
+ f32x4x2_t denom;
+ for (int i = 0; i < 2; ++i) {
+ denom.val[i] = vec_add(one, e.reg.val[i]);
+ }
+
+ FP32Vec8 denom_vec(denom);
+ FP32Vec8 one_vec(1.0f);
+
+ return one_vec / denom_vec;
+ }
+
+ // Tanh-based GELU:
+ // gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
+ FP32Vec8 gelu_tanh() const {
+ const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
+ const __vector float k_0_0447 = vec_splats(0.044715f);
+
+ f32x4x2_t x2, x3, inner;
+ for (int i = 0; i < 2; ++i) {
+ __vector float x = reg.val[i];
+ x2.val[i] = vec_mul(x, x); // x^2
+ x3.val[i] = vec_mul(x2.val[i], x); // x^3
+ __vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
+ inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
+ }
+
+ FP32Vec8 inner_vec(inner);
+ FP32Vec8 t = inner_vec.tanh(); // tanh part
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ FP32Vec8 x_vec(*this);
+ return x_vec * half_vec * (one_vec + t);
+ }
+
+ // Erf-based GELU:
+ // gelu(x) = 0.5 * x * (1 + erf(x / √2))
+ FP32Vec8 gelu_erf() const {
+ const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
+ FP32Vec8 x_vec(*this);
+
+ f32x4x2_t scaled;
+ for (int i = 0; i < 2; ++i) {
+ scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
+ }
+ FP32Vec8 x_scaled(scaled);
+
+ FP32Vec8 erf_x = x_scaled.er();
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ return x_vec * half_vec * (one_vec + erf_x);
+ }
+
+ // Elementwise reciprocal: 1/x (scalar per lane, for correctness)
+ FP32Vec8 rcp() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / in.values[i];
+ }
+ return FP32Vec8(out.reg);
+ }
+
+ // Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
+ FP32Vec8 rsqrt() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / std::sqrt(in.values[i]);
+ }
+ return FP32Vec8(out.reg);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
@@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec {
}
explicit FP32Vec16(const BF16Vec16& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
- reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
- reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
+ reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
+ reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec {
return result;
}
+ FP32Vec16 max(const FP32Vec16& b) const {
+ return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
+ vec_max(reg.val[1], b.reg.val[1]),
+ vec_max(reg.val[2], b.reg.val[2]),
+ vec_max(reg.val[3], b.reg.val[3])}));
+ }
+
+ float reduce_max() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float result = ar.values[0];
+ unroll_loop([&result, &ar](int i) {
+ if (ar.values[i] > result) result = ar.values[i];
+ });
+ return result;
+ }
+
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
@@ -402,15 +628,14 @@ struct VecType {
using vec_type = BF16Vec8;
};
+// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
+using FP16Vec16 = FP32Vec16;
+
template
void storeFP32(float v, T* ptr) {
*ptr = v;
}
-inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
- acc = acc + a * b;
-}
-
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
@@ -429,6 +654,79 @@ inline void storeFP32(float v, c10::BFloat16* ptr) {
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
+// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
+// intrinsics
+
+// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
+FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_madd(a.reg, b.reg, acc.reg);
+}
+
+// FP32Vec8 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+// FP32Vec16 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Multiply-Subtract: acc = acc - (a * b)
+FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_msub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Add: acc = -(a * b) + acc
+FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Subtract: acc = -(a * b) - acc
+FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
@@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ __vector unsigned int lsb2 = inp2 >> sh16;
+ __vector unsigned int lsb3 = inp3 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ lsb2 = lsb2 & one;
+ lsb3 = lsb3 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ __vector unsigned int rnd2 = lsb2 + bias;
+ __vector unsigned int rnd3 = lsb3 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
+ inp2 = inp2 + rnd2;
+ inp3 = inp3 + rnd3;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
- inp2 = vec_sel(inp2, nan, sel2) >> sh16;
- inp3 = vec_sel(inp3, nan, sel3) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp2 = vec_sel(inp2, nan, sel2);
+ inp3 = vec_sel(inp3, nan, sel3);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+ inp2 = inp2 >> sh16;
+ inp3 = inp3 >> sh16;
+
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
-inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
+// 1D softmax over `n` elements in `input`, writes result to `output`.
+// Uses FP32Vec8 for the vectorized main body with a scalar tail.
+// Three passes for numerical stability: (1) max, (2) exp(x - max) + sum,
+// (3) normalize by 1 / sum.  n <= 0 is a no-op.
+FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: find max ----------
+  // Fixed template argument: `std::numeric_limits::infinity()` (missing
+  // `<float>`) does not compile.
+  float max_val = -std::numeric_limits<float>::infinity();
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 v(input + i);
+    FP32Vec8::AliasReg ar;
+    ar.reg = v.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      if (ar.values[j] > max_val) max_val = ar.values[j];
+    }
+  }
+  for (; i < n; ++i) {
+    if (input[i] > max_val) max_val = input[i];
+  }
+
+  // ---------- Pass 2: compute exp(x - max) and sum ----------
+  float sum = 0.0f;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = input[i + j] - max_val;
+    }
+
+    FP32Vec8 v(tmp);
+    FP32Vec8 e = v.exp();
+
+    FP32Vec8::AliasReg ar;
+    ar.reg = e.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      output[i + j] = ar.values[j];
+      sum += ar.values[j];
+    }
+  }
+
+  // Tail (scalar std::exp may differ from the vector exp in the last ulp).
+  for (; i < n; ++i) {
+    float x = input[i] - max_val;
+    float ex = std::exp(x);  // scalar tail
+    output[i] = ex;
+    sum += ex;
+  }
+
+  // ---------- Pass 3: normalize ----------
+  float inv_sum = 1.0f / sum;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = output[i + j] * inv_sum;
+    }
+    FP32Vec8 v(tmp);
+    v.save(output + i);
+  }
+
+  for (; i < n; ++i) {
+    output[i] *= inv_sum;
+  }
+}
+
+// 1D RMSNorm kernel:
+//   input:  x[0..n-1]
+//   weight: w[0..n-1] (gamma), may be nullptr
+//   output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
+//   eps:    small epsilon for numerical stability
+// n <= 0 is a no-op.
+FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
+                                   const float* weight, int n, float eps) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: compute sum of squares ----------
+  float sum_sq = 0.0f;
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 x_vec(input + i);
+    FP32Vec8 sq = x_vec * x_vec;
+    FP32Vec8::AliasReg ar;
+    ar.reg = sq.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      sum_sq += ar.values[j];
+    }
+  }
+
+  // Tail
+  for (; i < n; ++i) {
+    float v = input[i];
+    sum_sq += v * v;
+  }
+
+  // Fixed template argument: `static_cast(n)` (missing `<float>`) does
+  // not compile.
+  float mean_sq = sum_sq / static_cast<float>(n);
+  float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
+
+  // ---------- Pass 2: scale (and apply weight if given) ----------
+  const float inv_rms_f = inv_rms;
+  // Loop invariant hoisted out of both loops: the broadcast 1/rms vector.
+  FP32Vec8 scale_vec(inv_rms_f);
+  i = 0;
+
+  if (weight) {
+    // with gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+      // Load gamma directly; no scalar staging copy needed.
+      FP32Vec8 w_vec(weight + i);
+      FP32Vec8 y = x_vec * scale_vec * w_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f * weight[i];
+    }
+  } else {
+    // without gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+      FP32Vec8 y = x_vec * scale_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f;
+    }
+  }
+}
+
+// Prefetch data to cache for better memory access performance
+// (replaces the previous no-op stub that only declared __dcbt).
+FORCE_INLINE void prefetch(const void* addr) {
+  __builtin_prefetch(addr, 0, 3);  // 0=read, 3=high temporal locality
+}
}; // namespace vec_op
From 2601f18a8216b8e0b8743cded669b8b4d0e3e980 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Mon, 24 Nov 2025 22:08:29 +0800
Subject: [PATCH 007/134] [EPLB] Optimize EPLB for Async Rearrange Experts
(#22179)
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: SunChenxiang123 <1291824390@qq.com>
---
tests/distributed/test_eplb_execute.py | 127 +++++-
tests/distributed/test_eplb_spec_decode.py | 39 +-
vllm/config/parallel.py | 4 +
vllm/distributed/eplb/async_worker.py | 115 ++++++
vllm/distributed/eplb/eplb_state.py | 433 ++++++++++++++++++---
vllm/distributed/eplb/rebalance_execute.py | 137 ++++++-
vllm/v1/worker/gpu_model_runner.py | 2 +
7 files changed, 779 insertions(+), 78 deletions(-)
create mode 100644 vllm/distributed/eplb/async_worker.py
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index 9498e75b279b7..781dfd44c1ef6 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -1,13 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
import random
import pytest
import torch
import torch.distributed
-from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
+from vllm.distributed.eplb.rebalance_execute import (
+ move_from_buffer,
+ rearrange_expert_weights_inplace,
+ transfer_layer,
+)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
@@ -231,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
)
+def _test_async_transfer_layer_without_mtp_worker(
+    env,
+    world_size: int,
+    num_layers: int,
+    num_local_experts: int,
+    num_logical_experts: int,
+) -> None:
+    """Per-rank body for `test_async_transfer_layer_without_mtp`.
+
+    Stages each layer's expert weights with `transfer_layer` on a dedicated
+    CUDA stream, consumes the buffer with `move_from_buffer`, and verifies
+    the final weights match the new expert placement.
+    """
+    set_env_vars_and_device(env)
+    ensure_model_parallel_initialized(
+        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+    )
+
+    tp_group = get_tp_group()
+    ep_group = tp_group.device_group
+    ep_rank = torch.distributed.get_rank()
+    device = torch.device(f"cuda:{ep_rank}")
+
+    total_physical_experts = world_size * num_local_experts
+    hidden_sizes = [16, 32]
+
+    # Two independent random redundancy layouts: the "old" placement the
+    # weights start in, and the "new" placement to transfer to.
+    redundancy_config = create_redundancy_config(
+        num_logical_experts,
+        total_physical_experts,
+    )
+    old_indices = create_expert_indices_with_redundancy(
+        num_layers,
+        num_logical_experts,
+        total_physical_experts,
+        redundancy_config,
+    )
+
+    new_redundancy_config = create_redundancy_config(
+        num_logical_experts,
+        total_physical_experts,
+    )
+    new_indices = create_expert_indices_with_redundancy(
+        num_layers,
+        num_logical_experts,
+        total_physical_experts,
+        new_redundancy_config,
+    )
+
+    expert_weights = create_expert_weights(
+        num_layers,
+        num_local_experts,
+        hidden_sizes,
+        ep_rank,
+        device,
+        old_indices,
+    )
+
+    # One staging buffer shared across layers, sized like a single layer.
+    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
+    cuda_stream = torch.cuda.Stream(device=device)
+
+    # Stage each layer on the side stream, synchronize, then consume.
+    for layer_idx in range(num_layers):
+        is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
+            transfer_layer(
+                old_global_expert_indices=old_indices,
+                new_global_expert_indices=new_indices,
+                expert_weights=expert_weights,
+                expert_weights_buffer=expert_buffer,
+                ep_group=ep_group,
+                layer=layer_idx,
+                cuda_stream=cuda_stream,
+            )
+        )
+
+        cuda_stream.synchronize()
+        move_from_buffer(
+            expert_weights=expert_weights[layer_idx],
+            expert_weights_buffer=expert_buffer,
+            is_unchanged=is_unchanged,
+            is_received_locally=is_received_locally,
+            experts_recv_loc=experts_recv_loc,
+            new_indices=new_indices[layer_idx].tolist(),
+            ep_group=ep_group,
+        )
+
+    verify_expert_weights_after_shuffle(
+        expert_weights,
+        new_indices,
+        hidden_sizes,
+        ep_rank,
+        num_local_experts,
+    )
+    verify_redundant_experts_have_same_weights(
+        expert_weights,
+        new_indices,
+        hidden_sizes,
+        world_size,
+        num_local_experts,
+    )
+
+
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
@@ -399,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
)
+@pytest.mark.parametrize(
+    "world_size,num_layers,num_local_experts,num_logical_experts",
+    [
+        (2, 2, 2, 3),
+    ],
+)
+def test_async_transfer_layer_without_mtp(
+    world_size: int,
+    num_layers: int,
+    num_local_experts: int,
+    num_logical_experts: int,
+):
+    """Exercise async EPLB transfer path without MTP/spec decode."""
+
+    # Needs one GPU per EP rank.
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    distributed_run(
+        _test_async_transfer_layer_without_mtp_worker,
+        world_size,
+        num_layers,
+        num_local_experts,
+        num_logical_experts,
+    )
+
+
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index 11e23f128f331..c055b7a3f6dd7 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
def get_model_args(
model_name: str,
- spec_model_name: str,
+ spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
+ use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
@@ -37,6 +38,8 @@ def get_model_args(
"enable_eplb": True,
"max_model_len": model_max_len,
}
+ if use_async:
+ model_args["eplb_config"] = {"use_async": True}
return model_args
@@ -94,3 +97,37 @@ def test_eplb_spec_decode(
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
+
+
+@large_gpu_mark(min_gb=80)
+def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
+    """
+    Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
+    """
+
+    # Reference GSM8K score for this configuration and the tolerance band.
+    TASK = "gsm8k"
+    FILTER = "exact_match,strict-match"
+    RTOL = 0.03
+    expected_gsm8k_value = 0.86
+
+    # NOTE(review): spec_model_name=None — presumably MTP uses the target
+    # model's built-in head; confirm against get_model_args.
+    model_args = get_model_args(
+        model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
+        spec_model_name=None,
+        spec_method="mtp",
+        tp_size=4,
+        model_max_len=4096,
+        use_async=True,
+    )
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=TASK,
+        batch_size=64,
+        num_fewshot=8,
+    )
+    measured_value = results["results"][TASK][FILTER]
+    assert (
+        measured_value - RTOL < expected_gsm8k_value
+        and measured_value + RTOL > expected_gsm8k_value
+    ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 4b0236d8de3f5..ad438a8b464e0 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -60,6 +60,10 @@ class EPLBConfig:
Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead.
"""
+ use_async: bool = False
+ """
+ Whether to use non-blocking EPLB.
+ """
@config
diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py
new file mode 100644
index 0000000000000..e4b4fc92eeaaa
--- /dev/null
+++ b/vllm/distributed/eplb/async_worker.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+The async worker that transfers experts in the background.
+"""
+
+import asyncio
+import threading
+from typing import TYPE_CHECKING
+
+import torch
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.parallel_state import get_ep_group
+from vllm.logger import init_logger
+
+from .rebalance_execute import transfer_layer
+
+if TYPE_CHECKING:
+ from .eplb_state import EplbState
+
+logger = init_logger(__name__)
+
+
+def start_async_worker(
+    state: "EplbState",
+    rank_mapping: dict[int, int] | None = None,
+    is_profile: bool = False,
+) -> threading.Thread:
+    """Spawn the daemon thread that runs the async EPLB transfer loop.
+
+    The thread pins itself to the state's CUDA device, creates a dedicated
+    CUDA stream and a private asyncio event loop, then runs
+    `transfer_run_periodically` for the life of the process (daemon=True).
+    Returns the started thread.
+    """
+    ep_group = get_ep_group().device_group
+    rank = ep_group.rank()
+    device_index = state.cuda_device_index
+
+    def thread_target() -> None:
+        # The worker needs a concrete CUDA device for its private stream.
+        assert device_index is not None
+        torch.cuda.set_device(device_index)
+        cuda_stream = torch.cuda.Stream(device=device_index)
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(
+                transfer_run_periodically(
+                    state=state,
+                    ep_group=ep_group,
+                    is_profile=is_profile,
+                    rank_mapping=rank_mapping,
+                    cuda_stream=cuda_stream,
+                )
+            )
+        except Exception as exc:  # pragma: no cover - diagnostic path
+            logger.exception("async loop error (Rank %d): %s", rank, str(exc))
+        finally:
+            loop.close()
+
+    thread = threading.Thread(target=thread_target, daemon=True)
+    thread.start()
+    return thread
+
+
+async def transfer_run_periodically(
+    state: "EplbState",
+    ep_group: ProcessGroup,
+    is_profile: bool = False,
+    rank_mapping: dict[int, int] | None = None,
+    cuda_stream: torch.cuda.Stream | None = None,
+) -> None:
+    """Background loop: wait for a rearrange signal, then stream layers.
+
+    Wakes on `state.rearrange_event`; for each async-enabled model, stages
+    expert weights layer by layer into the buffer on `cuda_stream` and
+    publishes completion via `buffer_ready_event` / `ep_buffer_ready`.
+    The main thread consumes each filled buffer and advances
+    `layer_to_transfer`.
+    """
+    while True:
+        await asyncio.to_thread(state.rearrange_event.wait)
+        logger.info("async worker woke up for EPLB transfer")
+
+        for model_state in state.model_states.values():
+            if not model_state.is_async_enabled:
+                continue
+            current_num_layers = model_state.model.num_moe_layers
+            while (
+                model_state.rebalanced
+                and model_state.layer_to_transfer < current_num_layers
+            ):
+                if (
+                    not model_state.ep_buffer_ready
+                    and model_state.rebalanced
+                    and model_state.new_physical_to_logical_map is not None
+                ):
+                    # Serialize buffer access with the consumer thread.
+                    await asyncio.to_thread(model_state.buffer_lock.acquire)
+                    try:
+                        # Re-check under the lock: the consumer may have
+                        # advanced past the last layer while we waited.
+                        if model_state.layer_to_transfer >= current_num_layers:
+                            break
+
+                        (
+                            model_state.is_unchanged,
+                            model_state.is_received_locally,
+                            model_state.experts_recv_loc,
+                        ) = await transfer_layer(
+                            old_global_expert_indices=model_state.physical_to_logical_map,
+                            new_global_expert_indices=model_state.new_physical_to_logical_map,
+                            expert_weights=model_state.model.expert_weights,
+                            expert_weights_buffer=model_state.expert_buffer,
+                            ep_group=ep_group,
+                            is_profile=is_profile,
+                            layer=model_state.layer_to_transfer,
+                            cuda_stream=cuda_stream,
+                            rank_mapping=rank_mapping,
+                        )
+                        # Publish completion: the consumer waits on this event
+                        # before touching the buffer.
+                        event = torch.cuda.Event(blocking=False)
+                        cuda_stream.record_event(event)
+                        model_state.buffer_ready_event = event
+                        model_state.ep_buffer_ready = 1
+                    finally:
+                        model_state.buffer_lock.release()
+                else:
+                    if not model_state.rebalanced:
+                        break
+                    # Buffer still owned by the consumer; yield briefly.
+                    await asyncio.sleep(0.001)
+
+        state.rearrange_event.clear()
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 526d3ceac7b8f..9f8798a96a2fc 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -26,6 +26,7 @@ MoE layer. If we have 32 EP ranks, then each GPU will hold 288 / 32 = 9 local
physical experts.
"""
+import threading
import time
from collections.abc import Sequence
from dataclasses import dataclass
@@ -43,8 +44,9 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MixtureOfExperts
+from .async_worker import start_async_worker
from .rebalance_algo import rebalance_experts
-from .rebalance_execute import rearrange_expert_weights_inplace
+from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
logger = init_logger(__name__)
@@ -132,6 +134,74 @@ class EplbModelState:
"""
model_name: str
model: MixtureOfExperts
+ expert_buffer: list[torch.Tensor]
+ """
+ The buffer to store the expert weights during transfer.
+ """
+ buffer_lock: threading.Lock
+ """
+ The lock to protect the expert buffer.
+ """
+ buffer_ready_event: torch.cuda.Event | None
+ """
+ CUDA event recorded when the async worker finishes filling the buffer.
+ The main thread waits on this before consuming the buffer.
+ """
+ ep_buffer_ready: int
+ """
+ The flag indicates whether the expert buffer is ready for transfer.
+ 0 or 1.
+ """
+ layer_to_transfer: int
+ """
+ The layer index to transfer in async mode.
+ """
+ rebalanced: bool
+ """
+    The flag indicates whether the expert rebalance has been computed.
+ """
+ pending_global_ready_check: bool
+ """
+ Whether the async EPLB needs to poll peers for buffer readiness.
+ """
+ is_unchanged: list[bool]
+ """
+    Intermediate variable between `move_to_buffer` and `move_to_workspace`.
+    The size is the same as the number of physical experts in the current layer.
+ """
+ is_received_locally: list[bool]
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ The size is same as the num of physical experts in the current layer.
+ """
+ experts_recv_loc: dict[int, int]
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ The size is same as the num of physical experts in the current layer.
+ """
+ is_async_enabled: bool
+ """
+ The flag indicates whether the EPLB is running in async mode.
+ """
+ cuda_device_index: int | None
+ """
+ CUDA device index for the async EPLB worker thread.
+ """
+ new_physical_to_logical_map: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as physical_to_logical_map
+ """
+ new_logical_to_physical_map: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as logical_to_physical_map
+ """
+ new_logical_replica_count: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as logical_replica_count
+ """
class EplbState:
@@ -164,12 +234,31 @@ class EplbState:
Otherwise, the rearrangement will hang at collective
communication calls.
"""
- self.expert_rearrangement_step: int = 0
+ self.expert_rearrangement_step_interval: int = 0
"""
Interval for expert rearrangement steps.
This is a constant and is taken from the config.
"""
- self.expert_rearrangement_step_interval: int = 0
+ self.is_async: bool = False
+ """
+ The flag indicates whether the EPLB is running in async mode.
+ """
+ self.rearrange_event = threading.Event()
+ """
+ Event to signal when a new rearrangement is needed for the async thread.
+ """
+ self.async_worker: threading.Thread | None = None
+ """
+ Background thread handling async transfers.
+ """
+ self.cuda_device_index: int | None = None
+ """
+ CUDA device index for the async EPLB worker thread.
+ """
+ if self.device.type == "cuda":
+ self.cuda_device_index = self.device.index
+ if self.cuda_device_index is None and torch.cuda.is_available():
+ self.cuda_device_index = torch.cuda.current_device()
@staticmethod
def build_initial_global_physical_to_logical_map(
@@ -239,6 +328,8 @@ class EplbState:
Build the initial EPLB state.
"""
self.validate_ep_configuration(model)
+ self.is_async = self.parallel_config.eplb_config.use_async
+
physical_to_logical_map_list = (
EplbState.build_initial_global_physical_to_logical_map(
model.num_routed_experts,
@@ -368,7 +459,12 @@ class EplbState:
physical_to_logical_map = new_physical_to_logical_map.to(self.device)
logical_to_physical_map.copy_(new_logical_to_physical_map)
logical_replica_count.copy_(new_logical_replica_count)
+ else:
+ new_physical_to_logical_map = None
+ new_logical_to_physical_map = None
+
+ new_logical_replica_count = None
model.set_eplb_state(
expert_load_pass,
logical_to_physical_map,
@@ -385,15 +481,33 @@ class EplbState:
)
self.expert_rearrangement_step = 0
- self.model_states[model_config.compute_hash()] = EplbModelState(
- physical_to_logical_map,
- logical_to_physical_map,
- logical_replica_count,
- expert_load_pass,
- expert_load_window,
- model_config.model,
- model,
+ expert_buffer = [torch.empty_like(w) for w in model.expert_weights[0]]
+
+ model_state = EplbModelState(
+ physical_to_logical_map=physical_to_logical_map,
+ logical_to_physical_map=logical_to_physical_map,
+ logical_replica_count=logical_replica_count,
+ expert_load_pass=expert_load_pass,
+ expert_load_window=expert_load_window,
+ model_name=model_config.model,
+ model=model,
+ expert_buffer=expert_buffer,
+ buffer_lock=threading.Lock(),
+ buffer_ready_event=None,
+ ep_buffer_ready=0,
+ layer_to_transfer=0,
+ rebalanced=False,
+ pending_global_ready_check=False,
+ is_unchanged=[],
+ is_received_locally=[],
+ experts_recv_loc={},
+ is_async_enabled=self.is_async,
+ cuda_device_index=self.cuda_device_index,
+ new_physical_to_logical_map=new_physical_to_logical_map,
+ new_logical_to_physical_map=new_logical_to_physical_map,
+ new_logical_replica_count=new_logical_replica_count,
)
+ self.model_states[model_config.compute_hash()] = model_state
def step(
self,
@@ -420,7 +534,7 @@ class EplbState:
- `max_tokens`: The maximum load across ranks.
- `balancedness`: The ratio of average load to maximum load.
"""
-
+ ep_group = get_ep_group().device_group
if is_profile:
self.rearrange(is_profile=True)
return
@@ -488,7 +602,49 @@ class EplbState:
# rearrangement step and perform rearrangement to ensure all ranks are
# performing collective communication.
self.expert_rearrangement_step += 1
+
+ if self.is_async:
+ for eplb_model_state in self.model_states.values():
+ if not eplb_model_state.is_async_enabled:
+ continue
+
+ all_ranks_buffer_ready = False
+ if eplb_model_state.pending_global_ready_check:
+ all_ranks_buffer_ready = self._all_ranks_buffer_ready(
+ eplb_model_state
+ )
+ if (
+ eplb_model_state.is_async_enabled
+ and eplb_model_state.ep_buffer_ready
+ and all_ranks_buffer_ready
+ ):
+ self.move_to_workspace(
+ model_state=eplb_model_state,
+ ep_group=ep_group,
+ is_profile=is_profile,
+ )
+ if (
+ eplb_model_state.layer_to_transfer
+ >= eplb_model_state.model.num_moe_layers
+ ):
+ self.post_eplb(eplb_model_state, is_profile)
+ eplb_model_state.rebalanced = False
+ eplb_model_state.layer_to_transfer = 0
+ eplb_model_state.pending_global_ready_check = False
+ logger.info(
+ "finish async transfer for model %s rank %d layer %d",
+ eplb_model_state.model_name,
+ ep_group.rank(),
+ eplb_model_state.model.num_moe_layers,
+ )
+
if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval:
+ if any(
+ eplb_model_state.is_async_enabled and eplb_model_state.rebalanced
+ for eplb_model_state in self.model_states.values()
+ ):
+ # Still performing asynchronous rearrangement
+ return
self.expert_rearrangement_step = 0
self.rearrange()
@@ -524,7 +680,11 @@ class EplbState:
if is_main_rank:
torch.cuda.synchronize()
time_start = time.perf_counter()
- logger.info("Rearranging experts %s...", "(profile)" if is_profile else "")
+ logger.info(
+ "Rearranging experts %s %s...",
+ "(async mode)" if self.is_async else "sync mode",
+ "(profile)" if is_profile else "",
+ )
if global_expert_loads is None:
# Map the physical expert load to global logical experts
@@ -593,6 +753,7 @@ class EplbState:
model = eplb_model_state.model
num_replicas = model.num_physical_experts
num_groups = model.num_expert_groups
+
if rank_mapping is not None and len(rank_mapping) == ep_group.size():
# NOTE(yongji): scale down, we need to rebalance the experts on
# remaining GPUs, transfer the experts while we haven't shutdown
@@ -608,7 +769,7 @@ class EplbState:
num_gpus = ep_group.size()
if num_gpus % num_nodes != 0:
- self.num_nodes = 1
+ num_nodes = 1
logger.warning_once(
f"num_gpus % num_nodes != 0, "
"not using hierarchical rearrangement algorithm.\n"
@@ -631,60 +792,216 @@ class EplbState:
num_gpus,
)
- # Update expert weights
- rearrange_expert_weights_inplace(
- eplb_model_state.physical_to_logical_map,
- new_physical_to_logical_map,
- eplb_model_state.model.expert_weights,
- ep_group,
- is_profile,
- rank_mapping,
- )
+ if not eplb_model_state.is_async_enabled or is_profile:
+ # Update expert weights
+ rearrange_expert_weights_inplace(
+ eplb_model_state.physical_to_logical_map,
+ new_physical_to_logical_map,
+ eplb_model_state.model.expert_weights,
+ ep_group,
+ is_profile,
+ rank_mapping,
+ )
- if not is_profile:
- if (
- eplb_model_state.physical_to_logical_map.shape[1]
- != new_physical_to_logical_map.shape[1]
- ):
- eplb_model_state.physical_to_logical_map = (
- new_physical_to_logical_map.to(
- eplb_model_state.physical_to_logical_map.device
+ if not is_profile:
+ if (
+ eplb_model_state.physical_to_logical_map.shape[1]
+ != new_physical_to_logical_map.shape[1]
+ ):
+ eplb_model_state.physical_to_logical_map = (
+ new_physical_to_logical_map.to(
+ eplb_model_state.physical_to_logical_map.device
+ )
)
+ else:
+ eplb_model_state.physical_to_logical_map.copy_(
+ new_physical_to_logical_map
+ )
+ max_physical_slots = new_logical_to_physical_map.shape[-1]
+ assert (
+ max_physical_slots
+ <= eplb_model_state.logical_to_physical_map.shape[-1]
)
- else:
- eplb_model_state.physical_to_logical_map.copy_(
- new_physical_to_logical_map
+ new_logical_to_physical_map = torch.nn.functional.pad(
+ new_logical_to_physical_map,
+ (
+ 0,
+ eplb_model_state.logical_to_physical_map.shape[-1]
+ - max_physical_slots,
+ ),
+ value=-1,
)
- max_physical_slots = new_logical_to_physical_map.shape[-1]
- assert (
- max_physical_slots
- <= eplb_model_state.logical_to_physical_map.shape[-1]
- )
- new_logical_to_physical_map = torch.nn.functional.pad(
+ eplb_model_state.logical_to_physical_map.copy_(
+ new_logical_to_physical_map
+ )
+ eplb_model_state.logical_replica_count.copy_(
+ new_logical_replica_count
+ )
+ if is_main_rank:
+ assert time_start is not None
+ torch.cuda.synchronize()
+ time_end = time.perf_counter()
+ logger.info(
+ "Rearranged experts%sin %.2f seconds.",
+ " (profile) " if is_profile else " ",
+ time_end - time_start,
+ )
+ else:
+ device = eplb_model_state.physical_to_logical_map.device
+ new_physical = new_physical_to_logical_map.to(device)
+ max_slots = eplb_model_state.logical_to_physical_map.shape[-1]
+ padded_logical = torch.nn.functional.pad(
new_logical_to_physical_map,
- (
- 0,
- eplb_model_state.logical_to_physical_map.shape[-1]
- - max_physical_slots,
- ),
+ (0, max(0, max_slots - new_logical_to_physical_map.shape[-1])),
value=-1,
+ ).to(eplb_model_state.logical_to_physical_map.device)
+ new_replica = new_logical_replica_count.to(
+ eplb_model_state.logical_replica_count.device
)
- eplb_model_state.logical_to_physical_map.copy_(
- new_logical_to_physical_map
- )
- eplb_model_state.logical_replica_count.copy_(new_logical_replica_count)
- if is_main_rank:
- assert time_start is not None
- torch.cuda.synchronize()
- time_end = time.perf_counter()
- logger.info(
- "Rearranged experts%sin %.2f seconds.",
- " (profile) " if is_profile else " ",
- time_end - time_start,
- )
+ eplb_model_state.new_physical_to_logical_map = new_physical
+ eplb_model_state.new_logical_to_physical_map = padded_logical
+ eplb_model_state.new_logical_replica_count = new_replica
+
+ eplb_model_state.rebalanced = True
+ eplb_model_state.layer_to_transfer = 0
+ eplb_model_state.pending_global_ready_check = True
+
+ # Signal async thread to start transferring layers
+ if self.is_async and (not is_profile):
+ self.rearrange_event.set()
return None
+    def start_async_loop(
+        self,
+        rank_mapping: dict[int, int] | None = None,
+        is_profile: bool = False,
+    ):
+        """Lazily start the async EPLB worker thread (no-op in sync mode
+        or when a worker has already been started)."""
+        if not self.is_async:
+            return
+        if self.async_worker is None:
+            self.async_worker = start_async_worker(
+                self,
+                rank_mapping=rank_mapping,
+                is_profile=is_profile,
+            )
+
+    def _update_layer_mapping_from_new(
+        self, model_state: EplbModelState, layer: int
+    ) -> None:
+        """Copy layer `layer`'s new expert mappings into the live tensors.
+
+        No-op unless all three `new_*` staging maps are populated. If the
+        physical-slot count changed, the whole physical map is replaced;
+        otherwise only this layer's row is updated. The new
+        logical->physical row is padded with -1 when it has fewer slots
+        than the live tensor.
+        """
+        if (
+            model_state.new_physical_to_logical_map is None
+            or model_state.new_logical_to_physical_map is None
+            or model_state.new_logical_replica_count is None
+        ):
+            return
+
+        target_device = model_state.physical_to_logical_map.device
+        new_physical = model_state.new_physical_to_logical_map
+        if model_state.physical_to_logical_map.shape[1] != new_physical.shape[1]:
+            model_state.physical_to_logical_map = new_physical.to(target_device)
+        else:
+            model_state.physical_to_logical_map[layer].copy_(
+                new_physical[layer].to(target_device)
+            )
+
+        logical_device = model_state.logical_to_physical_map.device
+        new_logical = model_state.new_logical_to_physical_map[layer].to(logical_device)
+        max_slots = model_state.logical_to_physical_map.shape[-1]
+        slot_delta = max_slots - new_logical.shape[-1]
+        if slot_delta > 0:
+            new_logical = torch.nn.functional.pad(
+                new_logical, (0, slot_delta), value=-1
+            )
+        model_state.logical_to_physical_map[layer].copy_(new_logical)
+
+        replica_device = model_state.logical_replica_count.device
+        model_state.logical_replica_count[layer].copy_(
+            model_state.new_logical_replica_count[layer].to(replica_device)
+        )
+
+    def _all_ranks_buffer_ready(self, model_state: EplbModelState) -> bool:
+        """Return True when every EP rank has its staging buffer filled.
+
+        All-reduces the local `ep_buffer_ready` flag and compares the sum to
+        the group size. Prefers the CPU group when available (avoids a GPU
+        round-trip); falls back to the device group, or to the local flag
+        when there is only one rank.
+        """
+        parallel_state = get_ep_group()
+        cpu_group = getattr(parallel_state, "cpu_group", None)
+        if cpu_group is not None and cpu_group.size() > 1:
+            flag = torch.tensor(
+                (int(model_state.ep_buffer_ready),), dtype=torch.int32, device="cpu"
+            )
+            all_reduce(flag, group=cpu_group)
+            return int(flag.item()) == cpu_group.size()
+
+        device_group = parallel_state.device_group
+        if device_group.size() <= 1:
+            return bool(model_state.ep_buffer_ready)
+
+        device = getattr(
+            parallel_state, "device", model_state.physical_to_logical_map.device
+        )
+        flag = torch.tensor(
+            (int(model_state.ep_buffer_ready),), dtype=torch.int32, device=device
+        )
+        all_reduce(flag, group=device_group)
+        return int(flag.item()) == device_group.size()
+
+ def move_to_workspace(
+ self,
+ model_state: EplbModelState,
+ ep_group: ProcessGroup,
+ is_profile: bool = False,
+ ):
+ if not model_state.buffer_lock.acquire(blocking=False):
+ return
+ try:
+ assert model_state.new_physical_to_logical_map is not None
+ device_index = model_state.cuda_device_index or self.cuda_device_index
+ if model_state.buffer_ready_event is not None and device_index is not None:
+ stream = torch.cuda.current_stream(device=device_index)
+ stream.wait_event(model_state.buffer_ready_event)
+ model_state.buffer_ready_event = None
+ move_from_buffer(
+ expert_weights=model_state.model.expert_weights[
+ model_state.layer_to_transfer
+ ],
+ expert_weights_buffer=model_state.expert_buffer,
+ is_unchanged=model_state.is_unchanged,
+ is_received_locally=model_state.is_received_locally,
+ experts_recv_loc=model_state.experts_recv_loc,
+ new_indices=model_state.new_physical_to_logical_map[
+ model_state.layer_to_transfer
+ ].tolist(),
+ ep_group=ep_group,
+ )
+ transferred_layer = model_state.layer_to_transfer
+ self._update_layer_mapping_from_new(model_state, transferred_layer)
+ # After the main thread consumes, advance layer_to_transfer
+ model_state.layer_to_transfer += 1
+ model_state.ep_buffer_ready = 0
+ logger.info(
+ "model %s successfully move_to_workspace layer %d",
+ model_state.model_name,
+ transferred_layer,
+ )
+ finally:
+ try:
+ model_state.buffer_lock.release()
+ except Exception as e:
+ logger.error(
+ "Rank %d: buffer_lock release failed in move_to_workspace: %s",
+ ep_group.rank(),
+ str(e),
+ )
+
+    def post_eplb(self, model_state: EplbModelState, is_profile: bool = False) -> None:
+        """Finalize a completed async rearrangement.
+
+        Outside profiling, re-applies the new mappings for every layer, then
+        drops the `new_*` staging tensors so the next rearrange starts clean.
+        """
+        assert model_state.new_physical_to_logical_map is not None
+        assert model_state.new_logical_to_physical_map is not None
+        assert model_state.new_logical_replica_count is not None
+        if not is_profile:
+            for layer_idx in range(model_state.physical_to_logical_map.shape[0]):
+                self._update_layer_mapping_from_new(model_state, layer_idx)
+        model_state.new_physical_to_logical_map = None
+        model_state.new_logical_to_physical_map = None
+        model_state.new_logical_replica_count = None
+
@staticmethod
def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
"""
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 5c1efbaf03bab..376dad8a72ef1 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -100,18 +100,19 @@ def get_ep_ranks_with_expert(
return ranks_to_send, ranks_to_recv_actual
-def shuffle_layer(
+def move_to_buffer(
num_local_experts: int,
- ep_rank: int,
old_indices: Sequence[int],
new_indices: Sequence[int],
expert_weights: Iterable[torch.Tensor],
expert_weights_buffer: Sequence[torch.Tensor],
+ cuda_stream: torch.cuda.Stream | None,
ep_group: ProcessGroup,
-) -> None:
+) -> tuple[list[bool], list[bool], dict[int, int]]:
"""
Perform expert weights rearrangement of one layer.
"""
+ ep_rank = ep_group.rank()
local2global = partial(
idx_local_to_global,
local_cnt=num_local_experts,
@@ -137,7 +138,8 @@ def shuffle_layer(
if old_indices[src_global] == new_indices[dst_global]:
is_received_locally[dst] = True
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- buffer[dst].copy_(weight[src])
+ with torch.cuda.stream(cuda_stream):
+ buffer[dst].copy_(weight[src], non_blocking=True)
p2p_ops: list[P2POp] = []
@@ -225,25 +227,115 @@ def shuffle_layer(
]
# 4. Execute the P2P operations. The real communication happens here.
- if p2p_ops:
+ if p2p_ops and cuda_stream is not None:
+ with torch.cuda.stream(cuda_stream):
+ reqs = batch_isend_irecv(p2p_ops)
+ for req in reqs:
+ req.wait()
+ elif p2p_ops:
reqs = batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
+ # wait for the communication to finish
+ return is_unchanged, is_received_locally, experts_recv_loc
+
+
+def move_from_buffer(
+ expert_weights: Iterable[torch.Tensor],
+ expert_weights_buffer: list[torch.Tensor],
+ is_unchanged: list[bool],
+ is_received_locally: list[bool],
+ experts_recv_loc: dict[int, int],
+ new_indices: Sequence[int],
+ ep_group: ProcessGroup,
+) -> None:
+ ep_rank = ep_group.rank()
+ num_local_experts = len(is_unchanged)
+
+ local2global = partial(
+ idx_local_to_global, local_cnt=num_local_experts, ep_rank=ep_rank
+ )
- # 5. Copy the weights from the buffer back to the original weights.
for dst in range(num_local_experts):
if is_unchanged[dst]:
continue
if is_received_locally[dst]:
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- weight[dst].copy_(buffer[dst])
+ weight[dst].copy_(buffer[dst], non_blocking=True)
else:
expert = new_indices[local2global(dst)]
if expert == -1:
continue
src = experts_recv_loc[expert]
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- weight[dst].copy_(buffer[src])
+ weight[dst].copy_(buffer[src], non_blocking=True)
+
+
+async def transfer_layer(
+ old_global_expert_indices: torch.Tensor,
+ new_global_expert_indices: torch.Tensor,
+ expert_weights: Sequence[Iterable[torch.Tensor]],
+ expert_weights_buffer: Sequence[torch.Tensor],
+ ep_group: ProcessGroup,
+ is_profile: bool = False,
+ layer: int = 0,
+ cuda_stream: torch.cuda.Stream | None = None,
+ rank_mapping: dict[int, int] | None = None,
+) -> tuple[list[bool], list[bool], dict[int, int]]:
+ """
+ Rearranges the expert weights in place according to the new expert indices.
+
+ The value of the indices arguments are logical indices of the experts,
+ while keys are physical.
+
+ Args:
+ old_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
+ new_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
+ expert_weights: A sequence of shape (num_moe_layers)(weight_count)
+ of tensors of shape (num_local_physical_experts, hidden_size_i).
+ For example, a linear layer may have up and down projection,
+ so weight_count = 2. Each weight's hidden size can be different.
+ ep_group: The device process group for expert parallelism.
+ is_profile (bool): If `True`, do not perform any actual weight copy.
+ This is used during profile run, where we only perform dummy
+ communications to reserve enough memory for the buffers.
+ """
+ ep_size = ep_group.size()
+ if rank_mapping is not None:
+ if len(rank_mapping) == ep_group.size():
+ # scale down
+ new_global_expert_indices = _map_new_expert_indices_with_rank_mapping(
+ new_global_expert_indices,
+ rank_mapping,
+ )
+ else:
+ # scale up
+ old_global_expert_indices = _map_old_expert_indices_with_rank_mapping(
+ old_global_expert_indices,
+ rank_mapping,
+ ep_group.size(),
+ )
+
+ assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1]
+ num_moe_layers, num_physical_experts = old_global_expert_indices.shape
+ assert len(expert_weights) == num_moe_layers
+ num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
+ assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
+ assert num_physical_experts == ep_size * num_local_physical_experts
+ # A buffer to hold the expert weights in one layer during the exchange.
+ # NOTE: Currently we assume the same weights across different layers
+ # have the same shape.
+
+ is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+ num_local_experts=num_local_physical_experts,
+ old_indices=old_global_expert_indices[layer].tolist(),
+ new_indices=new_global_expert_indices[layer].tolist(),
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ cuda_stream=cuda_stream,
+ ep_group=ep_group,
+ )
+ return is_unchanged, is_received_locally, experts_recv_loc
def rearrange_expert_weights_inplace(
@@ -296,7 +388,6 @@ def rearrange_expert_weights_inplace(
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
- ep_rank = ep_group.rank()
ep_size = ep_group.size()
assert num_physical_experts == ep_size * num_local_physical_experts
@@ -329,14 +420,24 @@ def rearrange_expert_weights_inplace(
torch.cuda.synchronize()
for layer in range(num_moe_layers):
- shuffle_layer(
- num_local_physical_experts,
- ep_rank,
- old_global_expert_indices_cpu[layer].tolist(),
- new_global_expert_indices_cpu[layer].tolist(),
- expert_weights[layer],
- expert_weights_buffer,
- ep_group,
+ is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+ num_local_experts=num_local_physical_experts,
+ old_indices=old_global_expert_indices_cpu[layer].tolist(),
+ new_indices=new_global_expert_indices_cpu[layer].tolist(),
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ cuda_stream=None,
+ ep_group=ep_group,
+ )
+
+ move_from_buffer(
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ is_unchanged=is_unchanged,
+ is_received_locally=is_received_locally,
+ experts_recv_loc=experts_recv_loc,
+ new_indices=new_global_expert_indices[layer].tolist(),
+ ep_group=ep_group,
)
@@ -428,4 +529,4 @@ def _map_new_expert_indices_with_rank_mapping(
return mapped_expert_indices
-__all__ = ["rearrange_expert_weights_inplace"]
+__all__ = ["transfer_layer", "move_from_buffer"]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6a54e02f861e9..cbafc9c993cc2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3370,6 +3370,8 @@ class GPUModelRunner(
old_global_expert_indices,
rank_mapping,
)
+ if self.eplb_state.is_async:
+ self.eplb_state.start_async_loop(rank_mapping=rank_mapping)
if (
self.vllm_config.compilation_config.mode
From f716a153723d4b2e18d01380cfe25d9ac636e2ef Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Mon, 24 Nov 2025 09:40:05 -0500
Subject: [PATCH 008/134] Update KServe guide link in documentation (#29258)
Signed-off-by: Yuan Tang
---
docs/deployment/integrations/kserve.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md
index edf79fca4f93e..37b29aa1a4876 100644
--- a/docs/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
-Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
+Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
From 7a228b5305f3257834c5079fd475f659bc4bf73d Mon Sep 17 00:00:00 2001
From: Laith Sakka
Date: Mon, 24 Nov 2025 07:12:41 -0800
Subject: [PATCH 009/134] Add option to use unbacked, and backed size obl
dynamic shapes for more sounds compilation. (#26199)
Signed-off-by: Laith Sakka
---
docs/design/debug_vllm_compile.md | 70 ++++++++++++
docs/design/torch_compile.md | 105 +++++++++++++++++-
.../test_dynamic_shapes_compilation.py | 88 +++++++++++++++
vllm/compilation/decorators.py | 63 +++++++++--
vllm/compilation/wrapper.py | 24 +++-
vllm/config/compilation.py | 60 +++++++++-
vllm/model_executor/models/llama.py | 12 +-
vllm/model_executor/models/qwen2.py | 35 +++++-
8 files changed, 442 insertions(+), 15 deletions(-)
create mode 100644 tests/compile/test_dynamic_shapes_compilation.py
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
index 8912eb58f8ac7..408d2878309dd 100644
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -151,6 +151,76 @@ To avoid this, please either:
2. wrap the branching logic into a custom operator. TorchDynamo does not
trace into custom operators.
+## Debugging constraint violations and dynamic shapes guards issues
+
+Dynamic-shape guards are a specific category of Dynamo guards. They are constraints that `torch.compile`
+attaches to dynamic dimensions (e.g., `seq_len`) to ensure the compiled artifact remains valid.
+These guards typically appear when framework code, custom passes, or user code branches based on
+dynamic shape values.
+
+**Example:**
+
+```python
+if x > 10:
+ # path A
+else:
+ # path B
+```
+
+This creates a guard `x > 10` or `x <= 10` depending on which path was traced.
+
+**vLLM's Assumption:**
+vLLM assumes that all guards added by torch.compile are safe to drop and will not
+constrain the compiled graph to specific input shapes. When this assumption is violated,
+it can cause issues that users need to debug.
+Some side effects that indicate this assumption is violated are runtime errors
+or `ConstraintViolationError`s.
+
+A `ConstraintViolationError` will be thrown if a dynamic shape gets constrained to
+a single value. If you encounter a constraint violation error or suspect that a dynamic
+shapes guard is being added incorrectly, you can use stricter dynamic shape modes to
+help debug the issue:
+
+```sh
+# Online - using unbacked mode
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+
+# Online - using backed_size_oblivious mode
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
+```
+
+```py
+# Offline - using unbacked mode
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+LLM(model, compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.UNBACKED)
+))
+
+# Offline - using backed_size_oblivious mode
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+LLM(model, compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS)
+))
+```
+
+These modes are stricter and reduce or eliminate the need for dynamic shape guarding, which can help isolate issues:
+
+- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
+- `backed_size_oblivious`: Uses a mode that is more strict about guarding.
+
+For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
+
+### Printing guards
+
+To see all guards that are being added during compilation, you can use `TORCH_LOGS=+dynamic`:
+
+```sh
+TORCH_LOGS=+dynamic vllm serve meta-llama/Llama-3.2-1B
+```
+
+Look for `[guard added]` in the logs to see where guards are being added. This can help you identify which operations are
+causing guards to be added incorrectly.
+
## Debugging TorchInductor
TorchInductor takes a captured graph and then compiles it down to some Python code
diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 27edc4f89201d..7b0b2c1e96978 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -29,6 +29,109 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
+## Dynamic shapes and vllm guard dropping
+
+`torch.compile` is designed to guard on dynamic shapes with no hesitation
+when needed. This conflicts with vLLM's `torch.compile` approach of
+dropping the guards, since many of those guards could be material.
+
+`torch.compile` provides two kinds of dynamic shapes: `backed` and `unbacked`.
+`torch.compile` guards on `backed` dynamic shapes and does not provide a
+guarantee that no guards will be added to them. User code, dynamo,
+inductor, and autograd all can add guards. Moreover, for 0/1
+specializations, backed symbols are specialized unconditionally to 0, 1,
+or >=2 even without encountering branching on those ranges.
+
+On the contrary, `unbacked` dynamic shapes are guaranteed not to be guarded
+on and are not 0/1 specialized. However, there is a possibility of
+throwing a data dependent error when a branch that requires their value is
+encountered and no explicit unbacked handling is defined. The framework is
+converging to a state where it won't throw DDE but rather pick general
+paths. One downside of using unbacked is missed optimization opportunities,
+due either to perf bugs or to picking general paths, as well as using a fixed
+hint that is not based on example inputs (this will be fixed soon with the
+override_hint API). An example of picking general paths is assuming the input
+is not contiguous in functions that call contiguous() and reshape() when
+contiguity can't be symbolically proven, at the cost of introducing a clone.
+
+`backed_size_oblivious` is a flag that enables treating backed symbols as
+unbacked wherever explicit handling for unbacked is defined. With this
+mode, 0/1 specializations are mostly avoided in framework code and the
+default 0/1 specialization does not happen. However, there is still no
+guarantee that torch.compile won't guard, especially due to user code or
+custom passes. `backed_size_oblivious` is experimental in PyTorch compile
+and could be deprecated. That said, it's a safer option to use than
+`backed` and the probability of reducing performance is lower than
+`unbacked`.
+
+### Configuring Dynamic Shapes
+
+The `DynamicShapesConfig` allows you to control the dynamic shapes behavior by
+setting the `type` field. You can choose between three modes:
+`BACKED`(default), `UNBACKED` , and `BACKED_SIZE_OBLIVIOUS`.
+
+#### Offline Inference Example (Using LLM class)
+
+When using the `LLM` class for offline inference, you can configure dynamic
+shapes through the `compilation_config` parameter:
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+
+# Example: Using backed_size_oblivious (experimental, safer than backed)
+llm = LLM(
+ model="meta-llama/Llama-3.2-1B",
+ compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(
+ type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS
+ )
+ )
+)
+
+# Example: Using unbacked (strongest guarantee against guards)
+llm = LLM(
+ model="meta-llama/Llama-3.2-1B",
+ compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(
+ type=DynamicShapesType.UNBACKED
+ )
+ )
+)
+
+# Generate outputs
+prompts = ["Hello, my name is", "The future of AI is"]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+outputs = llm.generate(prompts, sampling_params)
+```
+
+#### Online Serving Example (Using vllm serve)
+
+When using `vllm serve` for online serving, you can configure dynamic shapes
+through the `--compilation-config` flag:
+
+```bash
+# Example: Using unbacked
+vllm serve meta-llama/Llama-3.2-1B \
+ --compilation-config '{"dynamic_shapes_config": {"type": "unbacked"}}'
+
+
+# Alternative: Using dot notation (simpler for single values)
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+```
+
+#### Choosing the Right Mode
+
+- **BACKED** (default): Use when you're willing to accept potential unsafe dropping of guards
+for maximal performance. Guards could be unsoundly added and then ignored.
+
+- **UNBACKED**: Use when you need the strongest guarantee against guards.
+ This is the most conservative option but may miss some optimization opportunities.
+
+- **BACKED_SIZE_OBLIVIOUS**: Use when you want a balance between avoiding guards
+ and performance. This experimental mode is safer than BACKED but still not as
+ conservative as UNBACKED.
+
## Python Code Compilation
In the very verbose logs, we can see:
@@ -122,7 +225,7 @@ When all the shapes are known, `torch.compile` can compare different configs, an
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
- mm 0.0160 ms 81.6%
+ mm 0.0160 ms 81.6%
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
new file mode 100644
index 0000000000000..c20aea822fe81
--- /dev/null
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import gc
+
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config.compilation import CompilationMode, DynamicShapesType
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+
+def get_test_models():
+ """Get list of models to test based on PyTorch version"""
+ # TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
+ return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
+
+
+@pytest.mark.parametrize("model_name", get_test_models())
+@pytest.mark.parametrize(
+ "shapes_type",
+ [
+ DynamicShapesType.BACKED,
+ DynamicShapesType.UNBACKED,
+ DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
+ ],
+)
+@pytest.mark.parametrize("use_aot_compile", ["0"])
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+@pytest.mark.skipif(
+ not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_dynamic_shapes_compilation(
+ monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
+):
+ """Test that all dynamic shapes types compile successfully"""
+ print(
+ f"\nTesting model: {model_name} with {shapes_type.name}, "
+ f"AOT compile: {use_aot_compile}, "
+ f"Bytecode hook: {use_bytecode_hook}"
+ )
+ if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
+ pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
+
+ monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
+ monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
+
+ prompt = "Hello, my name is"
+
+ print(f"Testing {shapes_type.name} dynamic shapes...")
+
+ # Initialize the model with specific dynamic shapes configuration
+ model = LLM(
+ model=model_name,
+ compilation_config={
+ "mode": CompilationMode.VLLM_COMPILE,
+ "dynamic_shapes_config": {
+ "type": shapes_type.value,
+ },
+ },
+ )
+
+ output = model.generate(prompt)
+ result = output[0].outputs[0].text
+ # Example of setting the sampling parameters
+ tokenizer = get_tokenizer(model_name)
+ yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
+ no_tokens = tokenizer.encode("no", add_special_tokens=False)
+ allowed_ids = list(set(yes_tokens + no_tokens))
+ sampling_params = SamplingParams(
+ max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
+ )
+
+ output = model.generate(
+ "answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
+ sampling_params=sampling_params,
+ )
+ result = output[0].outputs[0].text
+ assert result == "yes"
+
+ # Clean up GPU memory
+ del model
+ gc.collect()
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ print("GPU memory cleared")
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 11a18c0e6bb78..6d9da1c488c6d 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -24,6 +24,7 @@ from vllm.config import (
get_current_vllm_config,
set_current_vllm_config,
)
+from vllm.config.compilation import DynamicShapesType
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -104,6 +105,7 @@ def support_torch_compile(
dynamic_arg_dims: dict[str, int | list[int]] | None = None,
mark_unbacked_dims: dict[str, int | list[int]] | None = None,
enable_if: Callable[[VllmConfig], bool] | None = None,
+ shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
) -> Callable[[_T], _T] | _T:
"""
A decorator to add support for compiling the forward method of a class.
@@ -161,6 +163,14 @@ def support_torch_compile(
dim to be decorated with `mark_unbacked`. This is useful if we would like to
enforce that dynamo does not specialize on 0/1 values in the case of dummy input
such as for vision model compilation
+
+ `shape_invariants` is a function that gets compiled right before forward.
+ The function should have the torch._check calls that are needed to set
+ the relationships between different input sizes. For example:
+ torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
+ This enforces constraints on the symbolic shapes without hardcoding
+ specific values. It is needed for some models to avoid data dependent
+ errors.
"""
def cls_decorator_helper(cls: _T) -> _T:
@@ -199,7 +209,11 @@ def support_torch_compile(
f"Argument {k} not found in the forward method of {cls}"
)
return _support_torch_compile(
- cls, inferred_dynamic_arg_dims, mark_unbacked_dims, enable_if
+ cls,
+ inferred_dynamic_arg_dims,
+ mark_unbacked_dims,
+ enable_if,
+ shape_invariants,
)
if cls is not None:
@@ -242,6 +256,7 @@ def _support_torch_compile(
dynamic_arg_dims: dict[str, int | list[int]],
mark_unbacked_dims: dict[str, int | list[int]] | None = None,
enable_if: Callable[[VllmConfig], bool] | None = None,
+ shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
) -> _T:
"""
A decorator to add support for compiling the forward method of a class.
@@ -276,11 +291,12 @@ def _support_torch_compile(
old_init(self, **kwargs)
self.vllm_config = vllm_config
+ self.compilation_config = self.vllm_config.compilation_config
enable_compile = enable_if is None or enable_if(vllm_config)
# for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
# will handle the compilation, so we don't need to do anything here.
self.do_not_compile = (
- vllm_config.compilation_config.mode
+ self.compilation_config.mode
in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
or not supports_dynamo()
or _should_ignore_torch_compile(self.__class__)
@@ -289,29 +305,38 @@ def _support_torch_compile(
if self.do_not_compile:
return
+ self._check_shape_invariants = shape_invariants
+
compilation_counter.num_models_seen += 1
self.compiled = False
TorchCompileWithNoGuardsWrapper.__init__(self)
cls.__init__ = __init__
- def _mark_dynamic_inputs(mod, *args, **kwargs):
+ def _mark_dynamic_inputs(mod, type, *args, **kwargs):
+ def mark_dynamic(arg, dims):
+ if type == DynamicShapesType.UNBACKED:
+ torch._dynamo.decorators.mark_unbacked(arg, dims)
+ else:
+ torch._dynamo.mark_dynamic(arg, dims)
+
sig = inspect.signature(mod.__class__.forward)
bound_args = sig.bind(mod, *args, **kwargs)
bound_args.apply_defaults()
for k, dims in dynamic_arg_dims.items():
arg = bound_args.arguments.get(k)
+
if arg is not None:
dims = [dims] if isinstance(dims, int) else dims
if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
- torch._dynamo.mark_dynamic(arg, dims)
+ mark_dynamic(arg, dims)
elif isinstance(arg, IntermediateTensors):
for tensor in arg.tensors.values():
# In case dims is specified with negative indexing
dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims]
- torch._dynamo.mark_dynamic(tensor, dims)
+ mark_dynamic(tensor, dims)
else:
raise ValueError(
"Unsupported dynamic dimensions"
@@ -338,6 +363,7 @@ def _support_torch_compile(
if getattr(self, "aot_compiled_fn", None) is not None:
return self.aot_compiled_fn(self, *args, **kwargs)
+ ds_type = self.compilation_config.dynamic_shapes_config.type
cache_dir = None
aot_compilation_path = None
if envs.VLLM_USE_AOT_COMPILE:
@@ -352,6 +378,14 @@ def _support_torch_compile(
serialized backend artifacts), then we need to generate a new AOT
compile artifact from scratch.
"""
+ # Validate that AOT compile is not used with unbacked dynamic
+ # shapes. aot_compile re-allocates backed symbols post dynamo!
+ if ds_type == DynamicShapesType.UNBACKED:
+ raise ValueError(
+ "AOT compilation is not compatible with UNBACKED dynamic shapes. "
+ "Please use BACKED or BACKED_SIZE_OBLIVIOUS dynamic shapes type "
+ "when VLLM_USE_AOT_COMPILE is enabled."
+ )
from .caching import compilation_config_hash_factors
factors: list[str] = compilation_config_hash_factors(self.vllm_config)
@@ -401,7 +435,12 @@ def _support_torch_compile(
# This is the path for the first compilation.
# the first compilation needs to have dynamic shapes marked
- _mark_dynamic_inputs(self, *args, **kwargs)
+ _mark_dynamic_inputs(
+ self,
+ ds_type,
+ *args,
+ **kwargs,
+ )
# here, it is the starting point of the `torch.compile` process
start_monitoring_torch_compile(self.vllm_config)
@@ -417,9 +456,7 @@ def _support_torch_compile(
# properly when any of these files change.
# 1. the file containing the top-level forward function
- self.vllm_config.compilation_config.traced_files.add(
- original_code_object.co_filename
- )
+ self.compilation_config.traced_files.add(original_code_object.co_filename)
# 2. every time Dynamo sees a function call, it will inline
# the function by calling InliningInstructionTranslator.inline_call_
@@ -429,7 +466,7 @@ def _support_torch_compile(
def patched_inline_call(self_):
code = self_.f_code
- self.vllm_config.compilation_config.traced_files.add(code.co_filename)
+ self.compilation_config.traced_files.add(code.co_filename)
return inline_call(self_)
# Disable the C++ compilation of symbolic shape guards. C++-fication
@@ -445,12 +482,18 @@ def _support_torch_compile(
# if the config doesn't exist
logger.debug("enable_cpp_symbolic_shape_guards config not available")
+ # Prepare backed_size_oblivious config patch if needed
+ fx_config_patches = {}
+ if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
+ fx_config_patches["backed_size_oblivious"] = True
+
with (
patch.object(
InliningInstructionTranslator, "inline_call_", patched_inline_call
),
torch._dynamo.config.patch(**dynamo_config_patches),
maybe_use_cudagraph_partition_wrapper(self.vllm_config),
+ torch.fx.experimental._config.patch(**fx_config_patches),
_torch27_patch_tensor_subclasses(),
):
if envs.VLLM_USE_AOT_COMPILE:
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 493e57f97f0f4..b120c85bf232e 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -6,6 +6,7 @@ import sys
from abc import abstractmethod
from contextlib import contextmanager
from types import CodeType
+from typing import Any
import torch
import torch._C._dynamo.guards
@@ -85,6 +86,12 @@ class TorchCompileWithNoGuardsWrapper:
since we drop all guards.
"""
+ def check_invariants_and_forward(self, *args, **kwargs):
+ assert hasattr(self, "_check_shape_invariants")
+ self._check_shape_invariants(*args, **kwargs)
+
+ return self.forward(*args, **kwargs)
+
def __init__(self):
self.compiled = False
@@ -104,6 +111,21 @@ class TorchCompileWithNoGuardsWrapper:
# Drop all the guards.
options["guard_filter_fn"] = lambda x: [False for _ in x]
+ # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False
+ from vllm.compilation.decorators import DynamicShapesType
+
+ ds_type = vllm_config.compilation_config.dynamic_shapes_config.type
+ compiled_ptr: Any = self.forward
+ if ds_type == DynamicShapesType.UNBACKED:
+ if envs.VLLM_USE_BYTECODE_HOOK:
+ # reason is that bytecode does this hack torch._dynamo.eval_frame.
+ # remove_from_cache(self.original_code_object()) to force a new
+ # re-compilation.
+ raise ValueError(
+ "UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0. "
+ )
+ compiled_ptr = self.check_invariants_and_forward
+
if envs.VLLM_USE_AOT_COMPILE:
if hasattr(torch._dynamo.config, "enable_aot_compile"):
torch._dynamo.config.enable_aot_compile = True
@@ -114,7 +136,7 @@ class TorchCompileWithNoGuardsWrapper:
logger.warning(msg)
self._compiled_callable = torch.compile(
- self.forward,
+ compiled_ptr,
fullgraph=True,
dynamic=False,
backend=backend,
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 9b5309598d0e2..42eccf9f41123 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -192,6 +192,54 @@ class PassConfig:
self.enable_qk_norm_rope_fusion = False
+class DynamicShapesType(str, enum.Enum):
+ """Types of dynamic shapes handling in torch.compile().
+ see Dynamic shapes and vllm guard dropping in torch_compile.md
+ for more details."""
+
+ BACKED = "backed"
+ """Use backed dynamic shapes. torch.compile() guards on backed dynamic
+ shapes and may add guards. Symbols are specialized to 0, 1, or >=2 even
+ without encountering branching on those ranges."""
+
+ UNBACKED = "unbacked"
+ """Use unbacked dynamic shapes. Guaranteed not to be guarded on and not
+ 0/1 specialized, but may throw data dependent errors when branches require
+ their value without explicit unbacked handling."""
+
+ BACKED_SIZE_OBLIVIOUS = "backed_size_oblivious"
+ """Experimental flag that treats backed symbols as unbacked when explicit
+ unbacked handling is defined."""
+
+
+@config
+@dataclass
+class DynamicShapesConfig:
+ """Configuration to control/debug torch compile dynamic shapes."""
+
+ type: DynamicShapesType = DynamicShapesType.BACKED
+ """Controls the type of dynamic shapes handling to use with torch.compile().
+
+ - BACKED: Default PyTorch behavior with potential guards ignored.
+ - UNBACKED: No guards guaranteed (most sound) but may throw
+ data dependent errors.
+ - BACKED_SIZE_OBLIVIOUS: Experimental safer alternative to
+ backed/unbacked.
+ """
+
+ # TODO add a debug mode to fail
+
+ def compute_hash(self) -> str:
+ """
+ Provide a hash for DynamicShapesConfig
+ """
+
+ from vllm.config.utils import get_hash_factors, hash_factors
+
+ factors = get_hash_factors(self, {})
+ return hash_factors(factors)
+
+
@config
@dataclass
class CompilationConfig:
@@ -322,7 +370,7 @@ class CompilationConfig:
If empty list [], no ops are excluded (suitable for full cudagraphs)."""
compile_mm_encoder: bool = False
"""Whether or not to compile the multimodal encoder.
- Currently, this only works for `Qwen2_5_vl` on selected platforms.
+ Currently, this only works for `Qwen2_5_vl` on selected platforms.
Disabled by default until more models are supported/tested to work."""
# Inductor capture
@@ -348,9 +396,11 @@ class CompilationConfig:
"""Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to
specify the sizes for cudagraph capture."""
+
inductor_compile_config: dict = field(default_factory=dict)
"""Additional configurations for inductor.
- None: use default configurations."""
+
inductor_passes: dict[str, str] = field(default_factory=dict)
"""Additional passes for inductor. It is a dictionary
from pass name to pass function qualified name. We use function
@@ -460,8 +510,15 @@ class CompilationConfig:
max_num_seqs, and prevents capture of many large graphs (>512) that would
greatly increase startup time with limited performance benefit.
"""
+
+ dynamic_shapes_config: DynamicShapesConfig = field(
+ default_factory=DynamicShapesConfig
+ )
+ """Configuration for dynamic shapes options"""
+
local_cache_dir: str = field(default=None, init=False) # type: ignore
"""local cache dir for each rank"""
+
bs_to_padded_graph_size: list[int] = field(
default=None, # type: ignore
init=False,
@@ -530,6 +587,7 @@ class CompilationConfig:
from vllm.config.utils import get_hash_factors, hash_factors
factors = get_hash_factors(self, ignored_factors)
+
factors["pass_config"] = self.pass_config.compute_hash()
return hash_factors(factors)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index ebf8addda4a54..eebb9e07fa89d 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -354,7 +354,17 @@ class LlamaDecoderLayer(nn.Module):
return vllm_config.quant_config
-@support_torch_compile
+def llama_model_invariants(
+ input_ids, positions, intermediate_tensors=None, inputs_embeds=None
+):
+ """Shape invariants for Llama model compilation, those are translated to
+ runtime assertions for unbacked dynamic shapes and are compiled away for
+ backed"""
+ if input_ids is not None:
+ torch._check(positions.size()[0] == input_ids.size()[0])
+
+
+@support_torch_compile(shape_invariants=llama_model_invariants)
class LlamaModel(nn.Module):
def __init__(
self,
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 32b6d6dd07b83..5831ce0b3d64b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -274,6 +274,38 @@ class Qwen2DecoderLayer(nn.Module):
return hidden_states, residual
+def qwen_2_model_invariants(
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+):
+ """Shape invariants for Qwen2Model Model, those are translated to
+ runtime assertions for unbacked dynamic shapes and are compiled away for
+ backed"""
+ # All these should be equal.
+ # input_ids.size()[0]
+ # positions.size()[-1]
+ # intermediate_tensors["hidden_states"].size()[0]
+ # inputs_embeds.size()[0]
+ torch._check(input_ids.size()[0] == positions.size()[-1])
+ if intermediate_tensors is not None:
+ torch._check(
+ input_ids.size()[0] == intermediate_tensors["hidden_states"].size()[0]
+ )
+
+ if inputs_embeds is not None:
+ torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
+
+ # Hidden dimensions should match (hidden_size)
+ # intermediate_tensors["hidden_states"].size()[1]
+ # inputs_embeds.size()[1]
+ if inputs_embeds is not None and intermediate_tensors is not None:
+ torch._check(
+ inputs_embeds.size()[1] == intermediate_tensors["hidden_states"].size()[1]
+ )
+
+
@support_torch_compile(
dynamic_arg_dims={
"input_ids": 0,
@@ -282,7 +314,8 @@ class Qwen2DecoderLayer(nn.Module):
"positions": -1,
"intermediate_tensors": 0,
"inputs_embeds": 0,
- }
+ },
+ shape_invariants=qwen_2_model_invariants,
)
class Qwen2Model(nn.Module):
def __init__(
From e48b2e6848ac0084860c8a6bd73927d7535a2c61 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Mon, 24 Nov 2025 22:24:49 +0700
Subject: [PATCH 010/134] [Bugfix] [ROCm] [UX] Reorganize ROCm Backend
Selection Logic (#26980)
Signed-off-by: vllmellm
---
.../test_rocm_attention_backends_selection.py | 337 ++++++++++++++++++
vllm/platforms/rocm.py | 80 +++--
2 files changed, 394 insertions(+), 23 deletions(-)
create mode 100644 tests/v1/attention/test_rocm_attention_backends_selection.py
diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py
new file mode 100644
index 0000000000000..4ec79e9eb6ba4
--- /dev/null
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -0,0 +1,337 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for attention backend selectors."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.platforms import current_platform
+
+# ROCm-specific attention backend selection tests
+pytestmark = pytest.mark.skipif(
+ not current_platform.is_rocm(), reason="ROCm-specific tests"
+)
+
+
+@pytest.fixture
+def mock_vllm_config():
+ """Create a mock VllmConfig for testing."""
+ config = MagicMock()
+ config.model_config.dtype = torch.float16
+ config.model_config.hf_config.architectures = ["LlamaForCausalLM"]
+ config.cache_config.block_size = 16
+ return config
+
+
+@pytest.fixture
+def mock_on_gfx9():
+ """Mock the on_gfx9 function to return True."""
+ with patch("vllm.platforms.rocm.on_gfx9", return_value=True):
+ yield
+
+
+@pytest.mark.parametrize(
+ "env_vars, selected_backend, expected_backend_path",
+ [
+ # Test Case 1: Default (no env vars, no explicit backend)
+ (
+ {},
+ None,
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 2: Explicit TRITON_ATTN backend
+ (
+ {},
+ "TRITON_ATTN",
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 3: Explicit ROCM_ATTN backend
+ (
+ {},
+ "ROCM_ATTN",
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ # Test Case 4: Explicit ROCM_AITER_FA backend
+ (
+ {},
+ "ROCM_AITER_FA",
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 5: Explicit ROCM_AITER_UNIFIED_ATTN backend
+ (
+ {},
+ "ROCM_AITER_UNIFIED_ATTN",
+ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
+ ),
+ # Test Case 6: VLLM_ROCM_USE_AITER=1
+ # (defaults to AITER FA when MHA not explicitly disabled)
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 7: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=1
+ (
+ {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "1"},
+ None,
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 8: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
+ (
+ {
+ "VLLM_ROCM_USE_AITER": "1",
+ "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": "1",
+ },
+ None,
+ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
+ ),
+ # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
+ (
+ {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+ None,
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "TRITON_ATTN",
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
+ # (explicitly disabled)
+ (
+ {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
+ None,
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "ROCM_ATTN",
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ ],
+)
+def test_standard_attention_backend_selection(
+ env_vars,
+ selected_backend,
+ expected_backend_path,
+ mock_vllm_config,
+ mock_on_gfx9,
+ monkeypatch,
+):
+ """Test standard attention backend selection with various configurations."""
+ # Set environment variables
+ for key, value in env_vars.items():
+ monkeypatch.setenv(key, value)
+
+ # Import after setting env vars to ensure they're picked up
+ # Reload envs to pick up new environment variables
+ import importlib
+
+ import vllm.envs as envs
+ from vllm.attention.backends.registry import _Backend
+
+ importlib.reload(envs)
+
+ # Convert string backend to enum if provided
+ backend_enum = None
+ if selected_backend:
+ backend_enum = getattr(_Backend, selected_backend)
+
+ # Get the backend class path
+ from vllm.platforms.rocm import RocmPlatform
+
+ backend_path = RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=False,
+ )
+ assert backend_path == expected_backend_path
+
+
+@pytest.mark.parametrize(
+ "env_vars, selected_backend, block_size, expected_backend_path, should_raise",
+ [
+ # Test Case 1: TRITON_MLA with block_size != 1
+ (
+ {},
+ "TRITON_MLA",
+ 16,
+ AttentionBackendEnum.TRITON_MLA.get_path(),
+ False,
+ ),
+ # Test Case 2: TRITON_MLA with block_size == 1 (should raise)
+ (
+ {},
+ "TRITON_MLA",
+ 1,
+ None,
+ True,
+ ),
+ # Test Case 3: ROCM_AITER_MLA with block_size == 1
+ (
+ {},
+ "ROCM_AITER_MLA",
+ 1,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 4: ROCM_AITER_MLA with block_size != 1 (should raise)
+ (
+ {},
+ "ROCM_AITER_MLA",
+ 16,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 5: VLLM_ROCM_USE_AITER=1 with block_size == 1
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ 1,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 6: VLLM_ROCM_USE_AITER=1 with block_size == 16
+ # (should use ROCM_AITER_MLA now, as it supports block_size 16)
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ 16,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 7: VLLM_ROCM_USE_AITER=1 + explicit TRITON_MLA
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "TRITON_MLA",
+ 16,
+ AttentionBackendEnum.TRITON_MLA.get_path(),
+ False,
+ ),
+ # Test Case 8: Explicit ROCM_AITER_TRITON_MLA
+ (
+ {},
+ "ROCM_AITER_TRITON_MLA",
+ 16,
+ AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path(),
+ False,
+ ),
+ ],
+)
+def test_mla_backend_selection(
+ env_vars,
+ selected_backend,
+ block_size,
+ expected_backend_path,
+ should_raise,
+ mock_vllm_config,
+ monkeypatch,
+):
+ """Test MLA backend selection with various configurations."""
+ # Set environment variables
+ for key, value in env_vars.items():
+ monkeypatch.setenv(key, value)
+
+ # Import after setting env vars
+ # Reload envs
+ import importlib
+
+ import vllm.envs as envs
+ from vllm.attention.backends.registry import _Backend
+
+ importlib.reload(envs)
+
+ # Mock is_aiter_mla_enabled based on env vars and block_size
+ aiter_enabled = env_vars.get("VLLM_ROCM_USE_AITER") == "1"
+
+ mock_rocm_ops = MagicMock()
+ mock_rocm_ops.is_mla_enabled.return_value = aiter_enabled
+ mock_aiter_module = MagicMock()
+ mock_aiter_module.rocm_aiter_ops = mock_rocm_ops
+
+ with patch.dict("sys.modules", {"vllm._aiter_ops": mock_aiter_module}):
+ # Convert string backend to enum if provided
+ backend_enum = None
+ if selected_backend:
+ backend_enum = getattr(_Backend, selected_backend)
+
+ from vllm.platforms.rocm import RocmPlatform
+
+ if should_raise:
+ with pytest.raises(ValueError):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=block_size,
+ use_mla=True,
+ has_sink=False,
+ use_sparse=False,
+ )
+ else:
+ backend_path = RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=block_size,
+ use_mla=True,
+ has_sink=False,
+ use_sparse=False,
+ )
+ assert backend_path == expected_backend_path
+
+
+def test_aiter_fa_requires_gfx9(mock_vllm_config):
+ """Test that ROCM_AITER_FA requires gfx9 architecture."""
+ from vllm.attention.backends.registry import _Backend
+ from vllm.platforms.rocm import RocmPlatform
+
+ # Mock on_gfx9 to return False
+ with (
+ patch("vllm.platforms.rocm.on_gfx9", return_value=False),
+ pytest.raises(
+ ValueError,
+ match="only supported on gfx9",
+ ),
+ ):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=_Backend.ROCM_AITER_FA,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=False,
+ )
+
+
+def test_sparse_not_supported(mock_vllm_config):
+ """Test that sparse attention is not supported on ROCm."""
+ from vllm.platforms.rocm import RocmPlatform
+
+ with pytest.raises(
+ AssertionError, match="Sparse MLA backend on ROCm only supports block size 1"
+ ):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=None,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=True,
+ )
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index f9005fd7d044c..f3ec965bd0881 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -262,30 +262,64 @@ class RocmPlatform(Platform):
f"is not MLA type while requested for MLA backend."
)
- if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
- logger.info("Using FlexAttention backend.")
- return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
- if (
- rocm_aiter_ops.is_mha_enabled()
- ) or selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
- logger.info("Using Aiter Flash Attention backend.")
- return AttentionBackendEnum.ROCM_AITER_FA.get_path()
- if (
- rocm_aiter_ops.is_triton_unified_attn_enabled()
- ) or selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
- logger.info("Using Aiter Unified Attention backend.")
- return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
- if (
- envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
- or selected_backend == AttentionBackendEnum.ROCM_ATTN
- ):
- # rocm specific backend, with aiter and/or
- # triton prefix-prefill
- logger.info("Using Rocm Attention backend.")
+ if selected_backend == AttentionBackendEnum.TRITON_ATTN:
+ logger.info("Using Triton Attention backend on V1 engine.")
+ return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ if selected_backend == AttentionBackendEnum.ROCM_ATTN:
+ logger.info("Using Rocm Attention backend on V1 engine.")
return AttentionBackendEnum.ROCM_ATTN.get_path()
- # default case, using triton unified attention
- logger.info("Using Triton Attention backend.")
- return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ if selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
+ if on_gfx9():
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+ else:
+ raise ValueError(
+ f"The selected backend, {selected_backend.name}, "
+ "is only supported on gfx9 architectures."
+ )
+
+ if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
+ logger.info("Using Aiter Unified Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+
+ # Handle automatic backend selection based on environment variables
+ if selected_backend is None:
+ # Priority 1: Check for AITER Unified Attention (must check before MHA)
+ if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION:
+ logger.info("Using Aiter Unified Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+
+ # Priority 2: Check for AITER MHA (Flash Attention)
+ # Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1)
+ if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
+ # Priority 3: Check for ROCM_ATTN (prefill-decode split)
+ if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+ logger.info("Using Rocm Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_ATTN.get_path()
+
+ # Priority 4: Check for AITER enabled without specific flags
+ # This defaults to AITER FA only if MHA is not explicitly disabled
+ if (
+ envs.VLLM_ROCM_USE_AITER
+ and on_gfx9()
+ and envs.VLLM_ROCM_USE_AITER_MHA is not False
+ ):
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
+ # Default: Triton Unified Attention
+ logger.info("Using Triton Attention backend on V1 engine.")
+ return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ raise RuntimeError(
+ "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
+ "to select a supported backend."
+ )
@classmethod
def set_device(cls, device: torch.device) -> None:
From 656516c3158ef932c6a19a6aab9fdf4df74b105f Mon Sep 17 00:00:00 2001
From: Aydin Abiar <62435714+Aydin-ab@users.noreply.github.com>
Date: Mon, 24 Nov 2025 07:28:51 -0800
Subject: [PATCH 011/134] [Bugfix] properly handle nested json with llama3 tool
parser (#27701)
Signed-off-by: Aydin Abiar
Signed-off-by: Aydin Abiar <62435714+Aydin-ab@users.noreply.github.com>
Co-authored-by: Aydin Abiar
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey
---
.../test_llama3_json_tool_parser.py | 128 ++++++++++++++++++
.../openai/tool_parsers/llama_tool_parser.py | 116 ++++++++++------
2 files changed, 203 insertions(+), 41 deletions(-)
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
index 2b68a653f4600..37e52d2cdf609 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import MagicMock, patch
+
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
@@ -132,3 +134,129 @@ def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
assert result.tool_calls[0].function.name == "searchTool"
assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
assert result.tool_calls[2].function.name == "searchTool"
+
+
+def test_extract_tool_calls_deeply_nested_json(parser):
+ # Test with deeply nested JSON parameters (5 levels)
+ model_output = (
+ '{"name": "complexTool", '
+ '"parameters": {'
+ '"level1": {'
+ '"level2": {'
+ '"level3": {'
+ '"level4": {'
+ '"value": "deep"'
+ "}}}}}}"
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "complexTool"
+ # Verify the nested structure is preserved in the arguments
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
+
+
+def test_extract_tool_calls_multiple_with_deep_nesting(parser):
+ # Test with multiple tool calls where some have deeply nested parameters
+ model_output = (
+ '{"name": "simpleTool", "parameters": {"value": "test"}}; '
+ '{"name": "complexTool", "parameters": '
+ '{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 2
+
+ # Check first tool call
+ assert result.tool_calls[0].function.name == "simpleTool"
+ import json
+
+ args0 = json.loads(result.tool_calls[0].function.arguments)
+ assert args0["value"] == "test"
+
+ # Check second tool call with deep nesting
+ assert result.tool_calls[1].function.name == "complexTool"
+ args1 = json.loads(result.tool_calls[1].function.arguments)
+ assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
+
+
+def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
+ # Test with quotes and brackets inside quoted string values
+ model_output = (
+ '{"name": "searchTool", '
+ '"parameters": {'
+ '"query": "test {value} [complex]",'
+ '"nested": {"inner": "more {brackets}"}'
+ "}}"
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "searchTool"
+ # Verify the string values are preserved including brackets and quotes
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["query"] == "test {value} [complex]"
+ assert args["nested"]["inner"] == "more {brackets}"
+
+
+def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
+ # Test with escaped quotes in deeply nested JSON
+ model_output = (
+ '{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "parserTool"
+ # Verify escaped quotes are preserved
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["text"] == 'He said "Hello {world}"'
+
+
+def test_extract_tool_calls_missing_name_key(parser):
+ # Test that missing "name" key returns content
+ model_output = '{"parameters": {}}'
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ assert result.content == model_output
+
+
+def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
+ # Test that missing both "parameters" and "arguments" keys returns content
+ model_output = '{"name": "toolWithoutParams"}'
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ assert result.content == model_output
+
+
+def test_regex_timeout_handling(parser):
+ """Test regex timeout is handled gracefully"""
+ fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2
+
+ # create a mock regex that raises TimeoutError
+ mock_regex = MagicMock()
+ mock_regex.finditer.side_effect = TimeoutError("Regex timeout")
+
+ with patch.object(parser, "tool_call_start_regex", mock_regex):
+ result = parser.extract_tool_calls(fake_problematic_input, None)
+
+ # should treat as regular text when regex times out
+ assert result.content == fake_problematic_input
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ mock_regex.finditer.assert_called_once()
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 02fc9b8a4d34e..e1fe6e90dfd0b 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -9,6 +9,7 @@ import regex as re
from partial_json_parser.core.options import Allow
from transformers import PreTrainedTokenizerBase
+import vllm.envs as envs
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -56,12 +57,10 @@ class Llama3JsonToolParser(ToolParser):
self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[
0
]
- # Updated regex to match multiple JSONs separated by semicolons
- # This pattern is more robust and can handle nested JSON objects
- self.tool_call_regex = re.compile(
- r"{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*",
- re.DOTALL,
- )
+ # Simple regex to find opening braces - we'll use JSON decoder for parsing
+ # This handles arbitrary nesting depth correctly
+ self.tool_call_start_regex = re.compile(r"\{")
+ self.json_decoder = json.JSONDecoder()
def extract_tool_calls(
self, model_output: str, request: ChatCompletionRequest
@@ -77,49 +76,84 @@ class Llama3JsonToolParser(ToolParser):
tools_called=False, tool_calls=[], content=model_output
)
- # Find JSON object(s) in the text using regex
- match = self.tool_call_regex.search(model_output)
- if not match:
+ # Keep track of the end index of the last parsed JSON object
+ # so we don't parse inner brackets
+ end_index = -1
+ tool_calls: list[ToolCall] = []
+
+ try:
+ for match in self.tool_call_start_regex.finditer(
+ model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+ ):
+ start_index = match.start()
+ # Skip if this brace is inside a previously parsed JSON object
+ if start_index <= end_index:
+ continue
+
+ try:
+ obj, json_end_index = self.json_decoder.raw_decode(
+ model_output[start_index:]
+ )
+ end_index = start_index + json_end_index
+
+ # raise KeyError if missing
+ name = obj["name"]
+ arguments_or_params = (
+ obj["arguments"] if "arguments" in obj else obj["parameters"]
+ )
+
+ tool_calls.append(
+ ToolCall(
+ type="function",
+ function=FunctionCall(
+ name=name,
+ # function call args are JSON but as a string
+ arguments=json.dumps(
+ arguments_or_params, ensure_ascii=False
+ ),
+ ),
+ )
+ )
+ except KeyError as e:
+ # Missing required key
+ missing_key = str(e).strip("'\"")
+ logger.exception(
+ "Couldn't extract tool call from JSON response. "
+ "Required key '%s' not present. "
+ "Returning output in content with empty tool calls.",
+ missing_key,
+ )
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
+ except Exception:
+ # Any other error during parsing
+ logger.exception(
+ "Error in extracting tool call from response. "
+ "Returning output in content with empty tool calls"
+ )
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
+ except TimeoutError:
+ logger.warning("Regex timeout occurred when matching tool call pattern.")
+ logger.debug(
+ "Regex timeout occurred when matching user input: %s", model_output
+ )
return ExtractedToolCallInformation(
tools_called=False, tool_calls=[], content=model_output
)
- try:
- json_str = match.group(0)
- # Split by semicolon and strip whitespace
- json_objects = [obj.strip() for obj in json_str.split(";")]
-
- tool_calls: list[ToolCall] = []
- for json_obj in json_objects:
- if not json_obj: # Skip empty strings
- continue
- obj = json.loads(json_obj)
- tool_calls.append(
- ToolCall(
- type="function",
- function=FunctionCall(
- name=obj["name"],
- # function call args are JSON but as a string
- arguments=json.dumps(
- obj["arguments"]
- if "arguments" in obj
- else obj["parameters"],
- ensure_ascii=False,
- ),
- ),
- )
- )
-
+ # If we have valid tool calls, return them normally
+ if tool_calls:
return ExtractedToolCallInformation(
tools_called=True, tool_calls=tool_calls, content=None
)
- except Exception:
- logger.exception("Error in extracting tool call from response.")
- # return information to just treat the tool call as regular JSON
- return ExtractedToolCallInformation(
- tools_called=False, tool_calls=[], content=model_output
- )
+ # No valid tool calls found
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
def extract_tool_calls_streaming(
self,
From e924bbb4f4ac3258a71a18ac4c753c8056bc059f Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Mon, 24 Nov 2025 11:06:17 -0500
Subject: [PATCH 012/134] [Build/CI][DP/EP] Add QWen/Qwen3-30B-A3B-FP8 + EPLB
tests to Nightly H100 and B200 (#29195)
Signed-off-by: Varun Sundar Rabindranath
Co-authored-by: Varun Sundar Rabindranath
---
...block_ep.sh => qwen30b_a3b_fp8_block_ep_eplb.sh} | 10 +++++++---
.buildkite/test-amd.yaml | 2 +-
.buildkite/test-pipeline.yaml | 13 +++++++++++--
3 files changed, 19 insertions(+), 6 deletions(-)
rename .buildkite/scripts/scheduled_integration_test/{qwen30b_a3b_fp8_block_ep.sh => qwen30b_a3b_fp8_block_ep_eplb.sh} (82%)
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
similarity index 82%
rename from .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
rename to .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index 0d06f53a183d0..6a1bef275d047 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -1,10 +1,12 @@
#!/usr/bin/env bash
set -euxo pipefail
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@@ -45,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
- --tensor-parallel-size 2 \
- --data-parallel-size 2 \
+ --enable-eplb \
+ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+ --data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index f098e23866eb3..4ddf11c0b268f 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1486,4 +1486,4 @@ steps:
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7a46e919f93bf..f1cd39ef4f948 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1340,11 +1340,20 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
From 26a465584ae14a04a1f6e9b36621d70e9d907c10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?=
Date: Mon, 24 Nov 2025 18:18:04 +0100
Subject: [PATCH 013/134] [NIXL] Use config to enable telemetry + NIXL version
bump (#29305)
Signed-off-by: NickLucche
---
requirements/kv_connectors.txt | 2 +-
.../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++------
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt
index b1f3269cd3813..083230c171096 100644
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@@ -1,2 +1,2 @@
lmcache
-nixl >= 0.6.0 # Required for disaggregated prefill
+nixl >= 0.7.1 # Required for disaggregated prefill
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 7c0911240493c..493938d4aad92 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -4,7 +4,6 @@ import contextlib
import copy
import logging
import math
-import os
import queue
import threading
import time
@@ -810,9 +809,6 @@ class NixlConnectorWorker:
self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
"backends", ["UCX"]
)
- # TODO temporary, once nixl allows for telemetry flag in config
- # (next release), we can remove this env var.
- os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
# Agent.
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
@@ -828,10 +824,11 @@ class NixlConnectorWorker:
if nixl_agent_config is None:
config = None
else:
+ # Enable telemetry by default for NIXL 0.7.1 and above.
config = (
- nixl_agent_config(backends=self.nixl_backends)
+ nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
if len(non_ucx_backends) > 0
- else nixl_agent_config(num_threads=num_threads)
+ else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
)
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
From cc313cb73d75cd5ac2715fc45bfadb89888cf8cd Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 09:32:27 -0800
Subject: [PATCH 014/134] [Model Runner V2] Implement Single-step Eagle 1
(#29300)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/input_batch.py | 3 +
vllm/v1/worker/gpu/model_runner.py | 79 +++++++++
vllm/v1/worker/gpu/sampler.py | 5 +-
vllm/v1/worker/gpu/spec_decode/__init__.py | 18 ++
vllm/v1/worker/gpu/spec_decode/eagle.py | 197 +++++++++++++++++++++
5 files changed, 300 insertions(+), 2 deletions(-)
create mode 100644 vllm/v1/worker/gpu/spec_decode/eagle.py
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 7675cb45170b5..1177d25e300cf 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -37,6 +37,9 @@ class InputBuffers:
self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
+ # Spec decoding.
+ self.next_prefill_tokens = self._make_buffer(max_num_reqs, dtype=torch.int32)
+
# Structured outputs.
self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32)
self.grammar_bitmask = self._make_buffer(
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 6e332ee4b75b8..205298a415d43 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -45,6 +45,7 @@ from vllm.v1.worker.gpu.input_batch import (
prepare_prefill_inputs,
)
from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs
+from vllm.v1.worker.gpu.spec_decode import init_speculator
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata
from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
@@ -97,16 +98,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.use_async_scheduling:
self.input_prep_event = torch.cuda.Event()
self.structured_outputs_event = torch.cuda.Event()
+ self.spec_decode_event = torch.cuda.Event()
else:
self.input_prep_event = None
self.structured_outputs_event = None
+ self.spec_decode_event = None
if self.speculative_config is not None:
self.do_spec_decode = True
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
+ self.speculator = init_speculator(self.vllm_config, self.device)
else:
self.do_spec_decode = False
self.num_speculative_steps = 0
+ self.speculator = None
self.req_states = RequestState(
max_num_reqs=self.max_num_reqs,
@@ -153,6 +158,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.vllm_config,
self.device,
)
+ if self.do_spec_decode:
+ self.speculator.load_model(self.model)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory
@@ -285,6 +292,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logits = self.model.compute_logits(hidden_states)
self.sampler(logits, sampling_metadata)
+ @torch.inference_mode()
+ def _dummy_speculator_run(
+ self,
+ hidden_states: torch.Tensor,
+ aux_hidden_states: list[torch.Tensor] | None,
+ ) -> None:
+ num_tokens = hidden_states.shape[0]
+ num_reqs = min(num_tokens, self.max_num_reqs)
+ input_batch = InputBatch.make_dummy(
+ num_reqs=num_reqs,
+ num_tokens=num_tokens,
+ input_buffers=self.input_buffers,
+ device=self.device,
+ )
+ sampling_metadata = SamplingMetadata.make_dummy(
+ num_reqs=num_reqs,
+ device=self.device,
+ )
+ num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device)
+ self.propose_draft(
+ input_batch=input_batch,
+ sampling_metadata=sampling_metadata,
+ last_hidden_states=hidden_states,
+ aux_hidden_states=aux_hidden_states,
+ num_sampled=num_sampled,
+ )
+
@torch.inference_mode()
def profile_run(self) -> None:
hidden_states, sample_hidden_states = self._dummy_run(
@@ -292,6 +326,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn=True,
)
self._dummy_sampler_run(sample_hidden_states)
+ if self.do_spec_decode:
+ self._dummy_speculator_run(hidden_states, None)
torch.cuda.synchronize()
del hidden_states, sample_hidden_states
gc.collect()
@@ -727,6 +763,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.req_states.prefill_len.np[idx_mapping_np],
)
+ @torch.inference_mode()
+ def propose_draft(
+ self,
+ input_batch: InputBatch,
+ sampling_metadata: SamplingMetadata,
+ last_hidden_states: torch.Tensor,
+ aux_hidden_states: list[torch.Tensor] | None,
+ num_sampled: torch.Tensor,
+ ) -> torch.Tensor:
+ num_reqs = input_batch.num_reqs
+ idx_mapping_np = input_batch.idx_mapping_np
+ with async_barrier(self.spec_decode_event):
+ self.input_buffers.next_prefill_tokens.np[:num_reqs] = (
+ self.req_states.prefill_token_ids[
+ idx_mapping_np,
+ self.req_states.num_computed_prefill_tokens[idx_mapping_np],
+ ]
+ )
+ next_prefill_tokens = self.input_buffers.next_prefill_tokens.copy_to_gpu(
+ num_reqs
+ )
+
+ assert self.speculator is not None
+ draft_tokens = self.speculator.propose(
+ input_batch,
+ sampling_metadata,
+ last_hidden_states,
+ aux_hidden_states,
+ num_sampled,
+ self.req_states.last_sampled_tokens,
+ next_prefill_tokens,
+ )
+ self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens
+ return draft_tokens
+
def get_cudagraph_and_dp_padding(
self,
scheduler_output: SchedulerOutput,
@@ -913,6 +984,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.postprocess(
input_batch, sampler_output.sampled_token_ids, num_sampled_tokens
)
+ if self.do_spec_decode:
+ _ = self.propose_draft(
+ input_batch,
+ sampling_metadata,
+ hidden_states,
+ None, # aux_hidden_states
+ num_sampled_tokens,
+ )
if self.use_async_scheduling:
return async_output
diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py
index c48ed2d8ca167..d8676079ab951 100644
--- a/vllm/v1/worker/gpu/sampler.py
+++ b/vllm/v1/worker/gpu/sampler.py
@@ -100,8 +100,9 @@ def _gumbel_sample_kernel(
mask=mask,
other=float("-inf"),
)
+ logits = logits.to(tl.float32)
- temp = tl.load(temp_ptr + req_idx)
+ temp = tl.load(temp_ptr + req_idx).to(tl.float32)
if temp != 0.0:
# Calculate the seed for gumbel noise.
seed = tl.load(seeds_ptr + req_idx)
@@ -116,7 +117,7 @@ def _gumbel_sample_kernel(
# Apply temperature.
if APPLY_TEMPERATURE:
# NOTE(woosuk): Use div_rn to match the behavior of torch.
- logits = tl.div_rn(logits, temp.to(tl.float32))
+ logits = tl.div_rn(logits, temp)
# Apply gumbel noise.
logits = tl.where(mask, logits + gumbel_noise, float("-inf"))
diff --git a/vllm/v1/worker/gpu/spec_decode/__init__.py b/vllm/v1/worker/gpu/spec_decode/__init__.py
index e69de29bb2d1d..15b85204e05ce 100644
--- a/vllm/v1/worker/gpu/spec_decode/__init__.py
+++ b/vllm/v1/worker/gpu/spec_decode/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.config import VllmConfig
+
+
+def init_speculator(
+ vllm_config: VllmConfig,
+ device: torch.device,
+):
+ speculative_config = vllm_config.speculative_config
+ assert speculative_config is not None
+ if speculative_config.use_eagle():
+ from vllm.v1.worker.gpu.spec_decode.eagle import EagleSpeculator
+
+ return EagleSpeculator(vllm_config, device)
+ raise NotImplementedError(f"{speculative_config.method} is not supported yet.")
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
new file mode 100644
index 0000000000000..0f11903e14540
--- /dev/null
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.config.compilation import CUDAGraphMode
+from vllm.forward_context import set_forward_context
+from vllm.model_executor.model_loader import get_model
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.sampler import gumbel_sample
+from vllm.v1.worker.gpu.states import SamplingMetadata
+
+
+class EagleSpeculator:
+ def __init__(self, vllm_config: VllmConfig, device: torch.device):
+ self.vllm_config = vllm_config
+ self.device = device
+
+ self.speculative_config = vllm_config.speculative_config
+ assert self.speculative_config is not None
+ self.method = self.speculative_config.method
+ self.num_speculative_steps = self.speculative_config.num_speculative_tokens
+ self.draft_model_config = self.speculative_config.draft_model_config
+
+ self.scheduler_config = vllm_config.scheduler_config
+ self.max_num_reqs = self.scheduler_config.max_num_seqs
+ self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
+
+ self.input_ids = torch.zeros(
+ self.max_num_tokens, dtype=torch.int32, device=device
+ )
+ self.positions = torch.zeros(
+ self.max_num_tokens, dtype=torch.int64, device=device
+ )
+
+ def load_model(self, target_model: nn.Module) -> None:
+ from vllm.compilation.backends import set_model_tag
+
+ with set_model_tag("eagle_head"):
+ self.model = get_model(
+ vllm_config=self.vllm_config, model_config=self.draft_model_config
+ )
+
+ share_lm_head = True
+ if share_lm_head and hasattr(target_model, "lm_head"):
+ if hasattr(self.model, "lm_head"):
+ del self.model.lm_head
+ self.model.lm_head = target_model.lm_head
+
+ @torch.inference_mode()
+ def propose(
+ self,
+ input_batch: InputBatch,
+ sampling_metadata: SamplingMetadata,
+ # [num_tokens, hidden_size]
+ last_hidden_states: torch.Tensor,
+ # num_layers x [num_tokens, hidden_size]
+ aux_hidden_states: list[torch.Tensor] | None,
+ # [num_reqs]
+ num_sampled: torch.Tensor,
+ # [max_num_reqs, 1]
+ last_sampled: torch.Tensor,
+ # [num_reqs]
+ next_prefill_tokens: torch.Tensor,
+ ) -> torch.Tensor:
+ if aux_hidden_states:
+ assert self.method == "eagle3"
+ hidden_states = self.model.combine_hidden_states(
+ torch.cat(aux_hidden_states, dim=-1)
+ )
+ else:
+ hidden_states = last_hidden_states
+
+ # Get the input ids and last token indices for the speculator.
+ last_token_indices = prepare_eagle_inputs(
+ self.input_ids,
+ input_batch,
+ num_sampled,
+ last_sampled,
+ next_prefill_tokens,
+ )
+ input_ids = self.input_ids[: input_batch.num_tokens_after_padding]
+
+ # Prefill: Run the eagle speculator with eager mode.
+ with set_forward_context(
+ input_batch.attn_metadata,
+ self.vllm_config,
+ num_tokens=input_batch.num_tokens_after_padding,
+ cudagraph_runtime_mode=CUDAGraphMode.NONE,
+ ):
+ ret_hidden_states = self.model(
+ input_ids=input_ids,
+ positions=input_batch.positions,
+ hidden_states=hidden_states,
+ )
+ if self.method == "mtp":
+ last_hidden_states = ret_hidden_states
+ hidden_states = ret_hidden_states
+ else:
+ last_hidden_states, hidden_states = ret_hidden_states
+ sample_hidden_states = last_hidden_states[last_token_indices]
+ logits = self.model.compute_logits(sample_hidden_states)
+
+ num_reqs = input_batch.num_reqs
+ cu_num_logits = input_batch.cu_num_logits[:num_reqs]
+ temperature = sampling_metadata.temperature[cu_num_logits]
+ seed = sampling_metadata.seeds[cu_num_logits]
+ # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
+ # used for draft and target sampling.
+ pos = input_batch.positions[last_token_indices] + 1
+ draft_tokens = gumbel_sample(
+ logits, temperature, seed, pos, apply_temperature=True
+ )
+ if self.num_speculative_steps == 1:
+ # Early exit.
+ return draft_tokens.view(-1, 1)
+ raise NotImplementedError("num_speculative_steps > 1 is not supported yet.")
+
+
+@triton.jit
+def _prepare_eagle_inputs_kernel(
+ last_token_indices_ptr,
+ eagle_input_ids_ptr,
+ target_input_ids_ptr,
+ idx_mapping_ptr,
+ last_sampled_ptr,
+ next_prefill_tokens_ptr,
+ num_sampled_ptr,
+ query_start_loc_ptr,
+ cu_num_logits_ptr,
+ BLOCK_SIZE: tl.constexpr,
+):
+ batch_idx = tl.program_id(0)
+ query_start = tl.load(query_start_loc_ptr + batch_idx)
+ query_end = tl.load(query_start_loc_ptr + batch_idx + 1)
+ query_len = query_end - query_start
+
+ # Get the true query length and next token after accounting for rejected tokens.
+ num_sampled = tl.load(num_sampled_ptr + batch_idx)
+ if num_sampled > 0:
+ req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+ next_token = tl.load(last_sampled_ptr + req_state_idx).to(tl.int32)
+
+ logits_start = tl.load(cu_num_logits_ptr + batch_idx)
+ logits_end = tl.load(cu_num_logits_ptr + batch_idx + 1)
+ num_logits = logits_end - logits_start
+
+ num_rejected = num_logits - num_sampled
+ query_len -= num_rejected
+ else:
+ # Chunked prefilling.
+ # Get the next prefill token.
+ next_token = tl.load(next_prefill_tokens_ptr + batch_idx)
+
+ # Shift target_input_ids by one.
+ for i in range(1, query_len, BLOCK_SIZE):
+ block = i + tl.arange(0, BLOCK_SIZE)
+ mask = block < query_len
+ input_ids = tl.load(target_input_ids_ptr + query_start + block, mask=mask)
+ tl.store(eagle_input_ids_ptr + query_start + block - 1, input_ids, mask=mask)
+
+ last_token_index = query_start + query_len - 1
+ tl.store(last_token_indices_ptr + batch_idx, last_token_index)
+ tl.store(eagle_input_ids_ptr + last_token_index, next_token)
+
+
+def prepare_eagle_inputs(
+ eagle_input_ids: torch.Tensor,
+ input_batch: InputBatch,
+ # [num_reqs]
+ num_sampled: torch.Tensor,
+ # [max_num_reqs, 1]
+ last_sampled: torch.Tensor,
+ # [max_num_reqs]
+ next_prefill_tokens: torch.Tensor,
+) -> torch.Tensor:
+ num_reqs = input_batch.num_reqs
+ last_token_indices = torch.empty(
+ num_reqs,
+ dtype=torch.int64,
+ device=eagle_input_ids.device,
+ )
+ _prepare_eagle_inputs_kernel[(num_reqs,)](
+ last_token_indices,
+ eagle_input_ids,
+ input_batch.input_ids,
+ input_batch.idx_mapping,
+ last_sampled,
+ next_prefill_tokens,
+ num_sampled,
+ input_batch.query_start_loc,
+ input_batch.cu_num_logits,
+ BLOCK_SIZE=1024,
+ )
+ return last_token_indices
From cec418b5df3bf032a83b6a6795e8026d39e199bd Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 09:34:37 -0800
Subject: [PATCH 015/134] [Model Runner V2] Change Numba AoT to JIT (#29328)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/input_batch.py | 71 +++++++-----------------------
vllm/v1/worker/gpu/model_runner.py | 24 ++++++----
2 files changed, 32 insertions(+), 63 deletions(-)
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 1177d25e300cf..3ac43ea4952de 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -4,7 +4,6 @@ from dataclasses import dataclass
from typing import Any
import numba
-import numba.types as types
import numpy as np
import torch
@@ -147,80 +146,42 @@ class InputBatch:
)
-# NOTE: With the type annotations, this function is pre-compiled
-# before the first call.
-@numba.jit(
- [
- types.none(
- types.int32[:], # idx_mapping
- types.int32[:], # num_scheduled_tokens
- types.int32[:, :], # prefill_token_ids
- types.int32[:], # num_computed_prefill_tokens
- types.int32[:], # prefill_len
- types.int32[:], # input_ids
- types.int32[:], # query_start_loc
- )
- ],
- nopython=True,
- cache=True,
-)
+@numba.njit(cache=True)
def _prepare_prefill_inputs(
- idx_mapping: np.ndarray, # batch_idx -> req_idx
- num_scheduled_tokens: np.ndarray, # [B]
+ idx_mapping: np.ndarray, # [B]
+ query_lens: np.ndarray, # [B]
+ query_start_loc: np.ndarray, # [B + 1]
prefill_token_ids: np.ndarray, # [N, max_model_len]
num_computed_prefill_tokens: np.ndarray, # [N]
- prefill_len: np.ndarray, # [N]
input_ids: np.ndarray, # [num_input_tokens]
- query_start_loc: np.ndarray, # [B + 1]
) -> None:
- num_reqs = num_scheduled_tokens.shape[0]
- query_start_loc[0] = 0
-
- cu_num_tokens = 0
+ num_reqs = idx_mapping.shape[0]
+ query_starts = query_start_loc[:num_reqs]
+ query_ends = query_start_loc[1 : num_reqs + 1]
+ starts = num_computed_prefill_tokens[idx_mapping]
+ ends = starts + query_lens
for i in range(num_reqs):
- req_idx = idx_mapping[i]
- query_len = num_scheduled_tokens[i]
-
- start = num_computed_prefill_tokens[req_idx]
- end = min(start + query_len, prefill_len[req_idx])
- n = end - start
-
- start_idx = cu_num_tokens
- input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end]
-
- cu_num_tokens = start_idx + query_len
- query_start_loc[i + 1] = cu_num_tokens
-
- # Pad the inputs for CUDA graphs.
- # Note: pad query_start_loc to be non-decreasing, as kernels
- # like FlashAttention requires that
- query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
+ input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[
+ idx_mapping[i], starts[i] : ends[i]
+ ]
def prepare_prefill_inputs(
idx_mapping: np.ndarray,
num_scheduled_tokens: np.ndarray,
- total_num_tokens: int,
+ query_start_loc: np.ndarray,
prefill_token_ids: np.ndarray,
num_computed_prefill_tokens: np.ndarray,
- prefill_len: np.ndarray,
- input_ids: CpuGpuBuffer,
- query_start_loc: CpuGpuBuffer,
+ input_ids: np.ndarray,
) -> None:
_prepare_prefill_inputs(
idx_mapping,
num_scheduled_tokens,
+ query_start_loc,
prefill_token_ids,
num_computed_prefill_tokens,
- prefill_len,
- input_ids.np,
- query_start_loc.np,
+ input_ids,
)
- input_ids.copy_to_gpu(total_num_tokens)
- # NOTE(woosuk): We should copy the whole query_start_loc and seq_lens
- # tensors from CPU to GPU, because they may include paddings needed
- # for full CUDA graph mode.
- query_start_loc.copy_to_gpu()
@triton.jit
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 205298a415d43..e0ed183d3c5b0 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -502,20 +502,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
block_tables = self.block_tables.gather_block_tables(idx_mapping)
- # Copy prefill tokens from CPU to GPU and get query_start_loc.
+ # Get query_start_loc.
+ np.cumsum(
+ num_scheduled_tokens,
+ out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
+ )
+ # Pad for full CUDA graph mode.
+ # Some attention backends like FA3 require query_start_loc to be non-decreasing.
+ self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
+ self.input_buffers.query_start_loc.copy_to_gpu()
+ query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
+ query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
+
+ # Copy prefill tokens from CPU to GPU.
prepare_prefill_inputs(
idx_mapping_np,
num_scheduled_tokens,
- num_tokens,
+ query_start_loc_np,
self.req_states.prefill_token_ids,
self.req_states.num_computed_prefill_tokens,
- self.req_states.prefill_len.np,
- self.input_buffers.input_ids,
- self.input_buffers.query_start_loc,
+ self.input_buffers.input_ids.np,
)
- query_start_loc = self.input_buffers.query_start_loc
- query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
- query_start_loc_np = query_start_loc.np[: num_reqs + 1]
+ self.input_buffers.input_ids.copy_to_gpu(num_tokens)
# Prepare positions and seq_lens.
prepare_pos_seq_lens(
From 8f066146c395dfadb86914c88d9a0f3173f8fa39 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Mon, 24 Nov 2025 13:38:04 -0500
Subject: [PATCH 016/134] [MoE][Refactor] Make select_experts a non-static
method (#29067)
Signed-off-by: Bill Nell
---
tests/kernels/moe/test_flashinfer.py | 19 +--
tests/test_routing_simulator.py | 35 ++++-
.../layers/fused_moe/fused_moe_method_base.py | 6 +-
.../fused_moe/fused_moe_modular_method.py | 41 +-----
vllm/model_executor/layers/fused_moe/layer.py | 118 +++++++++--------
.../fused_moe/unquantized_fused_moe_method.py | 32 +----
.../layers/quantization/awq_marlin.py | 17 +--
.../layers/quantization/bitsandbytes.py | 20 +--
.../compressed_tensors_moe.py | 123 +++---------------
.../layers/quantization/experts_int8.py | 19 +--
.../model_executor/layers/quantization/fp8.py | 29 +----
.../layers/quantization/gguf.py | 17 +--
.../layers/quantization/gptq_marlin.py | 19 +--
.../layers/quantization/modelopt.py | 45 ++-----
.../layers/quantization/moe_wna16.py | 17 +--
.../layers/quantization/mxfp4.py | 23 +---
.../layers/quantization/quark/quark_moe.py | 38 +-----
.../model_executor/layers/quantization/rtn.py | 17 +--
18 files changed, 163 insertions(+), 472 deletions(-)
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index 638741e91619b..a6977f222408d 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
apply_flashinfer_per_tensor_scale_fp8,
flashinfer_cutlass_moe_fp8,
@@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
- router_logits=score,
- use_grouped_topk=False,
- top_k=topk,
+ gating_output=score,
+ topk=topk,
renormalize=False,
- custom_routing_function=Llama4MoE.custom_routing_function,
- scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
@@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
- router_logits=score,
- use_grouped_topk=False,
- top_k=topk,
+ gating_output=score,
+ topk=topk,
renormalize=False,
- custom_routing_function=Llama4MoE.custom_routing_function,
- scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py
index 5a162fa8f791b..e8826eb441a24 100644
--- a/tests/test_routing_simulator.py
+++ b/tests/test_routing_simulator.py
@@ -9,9 +9,16 @@ different routing strategies and analyze their performance, including
integration tests with FusedMoE layer.
"""
+import tempfile
+
import pytest
import torch
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.distributed import (
+ init_distributed_environment,
+ initialize_model_parallel,
+)
from vllm.model_executor.layers.fused_moe.routing_simulator import (
DistributionBasedRouting,
RoutingSimulator,
@@ -89,6 +96,28 @@ def test_routing_strategy_integration(monkeypatch, device):
# Test different routing strategies
strategies = RoutingSimulator.get_available_strategies()
+ vllm_config = VllmConfig()
+ with set_current_vllm_config(vllm_config):
+ temp_file = tempfile.mkstemp()[1]
+ init_distributed_environment(
+ world_size=1,
+ rank=0,
+ local_rank=0,
+ distributed_init_method=f"file://{temp_file}",
+ )
+ initialize_model_parallel(
+ tensor_model_parallel_size=1,
+ pipeline_model_parallel_size=1,
+ )
+ fused_moe = FusedMoE(
+ num_experts=num_experts,
+ top_k=top_k,
+ hidden_size=hidden_size,
+ intermediate_size=0,
+ use_grouped_topk=False,
+ renormalize=True,
+ )
+
for strategy in strategies:
# Set environment variable
env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY"
@@ -98,13 +127,9 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
- top_k=top_k,
- use_grouped_topk=False,
- renormalize=True,
- indices_type=torch.long,
)
# Verify output shapes
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index 073e90a4e6808..ef7090c349fc6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -90,10 +90,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
def allow_inplace(self) -> bool:
return False
+ @property
+ def method_name(self) -> str:
+ return self.__class__.__name__
+
@abstractmethod
def apply(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index c6dc95acdb636..c23c41df226f0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -66,6 +66,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def allow_inplace(self) -> bool:
return self.old_quant_method.allow_inplace
+ @property
+ def method_name(self) -> str:
+ return self.old_quant_method.method_name
+
def create_weights(
self,
layer: torch.nn.Module,
@@ -84,7 +88,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def apply(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -105,42 +109,9 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- # Is getattr needed?
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
- if enable_eplb:
- if self.supports_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
- else:
- raise NotImplementedError(
- "EPLB is not supported for "
- f"{self.old_quant_method.__class__.__name__}."
- )
-
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
)
result = self.fused_experts(
@@ -156,7 +127,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
expert_map=None if self.disable_expert_map else expert_map,
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 6619b64b2bbc0..0ef3130b26333 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1510,30 +1510,11 @@ class FusedMoE(CustomOp):
logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
)
- @staticmethod
def select_experts(
+ self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
- top_k: int,
- use_grouped_topk: bool,
- renormalize: bool,
- topk_group: int | None = None,
- num_expert_group: int | None = None,
- custom_routing_function: Callable | None = None,
- scoring_func: str = "softmax",
- routed_scaling_factor: float = 1.0,
- e_score_correction_bias: torch.Tensor | None = None,
- indices_type: torch.dtype | None = None,
- enable_eplb: bool = False,
- expert_map: torch.Tensor | None = None,
- expert_load_view: torch.Tensor | None = None,
- logical_to_physical_map: torch.Tensor | None = None,
- logical_replica_count: torch.Tensor | None = None,
- global_num_experts: int | None = None,
- zero_expert_num: int | None = None,
- zero_expert_type: str | None = None,
- num_fused_shared_experts: int = 0,
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
"""
Route the input hidden states to the top-k experts based on the
router logits.
@@ -1552,6 +1533,27 @@ class FusedMoE(CustomOp):
fused_topk_bias,
)
+ if self.enable_eplb:
+ if self.quant_method.supports_eplb:
+ if self.expert_load_view is None:
+ raise ValueError(
+ "enable_eplb=True requires expert_load_view != None"
+ )
+ if self.logical_to_physical_map is None:
+ raise ValueError(
+ "enable_eplb=True requires logical_to_physical_map != None"
+ )
+ if self.logical_replica_count is None:
+ raise ValueError(
+ "enable_eplb=True requires logical_replica_count != None"
+ )
+ else:
+ raise NotImplementedError(
+ f"EPLB is not supported for {self.quant_method.method_name}."
+ )
+
+ indices_type = self.quant_method.topk_indices_dtype
+
# Check if we should use a routing simulation strategy
routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
if routing_strategy != "":
@@ -1559,20 +1561,20 @@ class FusedMoE(CustomOp):
hidden_states=hidden_states,
router_logits=router_logits,
strategy_name=routing_strategy,
- top_k=top_k,
+ top_k=self.top_k,
indices_type=indices_type,
)
# DeepSeekv2 uses grouped_top_k
- elif use_grouped_topk:
- assert topk_group is not None
- assert num_expert_group is not None
+ elif self.use_grouped_topk:
+ assert self.topk_group is not None
+ assert self.num_expert_group is not None
if rocm_aiter_ops.is_fused_moe_enabled():
if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
- assert num_fused_shared_experts == 0
+ assert self.num_fused_shared_experts == 0
grouped_topk_impl = partial(
rocm_aiter_grouped_topk,
- num_fused_shared_experts=num_fused_shared_experts,
+ num_fused_shared_experts=self.num_fused_shared_experts,
)
else:
grouped_topk_impl = grouped_topk
@@ -1580,50 +1582,46 @@ class FusedMoE(CustomOp):
topk_weights, topk_ids = grouped_topk_impl(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
- num_expert_group=num_expert_group,
- topk_group=topk_group,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
+ topk=self.top_k,
+ renormalize=self.renormalize,
+ num_expert_group=self.num_expert_group,
+ topk_group=self.topk_group,
+ scoring_func=self.scoring_func,
+ routed_scaling_factor=self.routed_scaling_factor,
+ e_score_correction_bias=self.e_score_correction_bias,
)
- elif e_score_correction_bias is not None:
+ elif self.e_score_correction_bias is not None:
topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
gating_output=router_logits,
- e_score_correction_bias=e_score_correction_bias.data,
- topk=top_k,
- renormalize=renormalize,
+ e_score_correction_bias=self.e_score_correction_bias.data,
+ topk=self.top_k,
+ renormalize=self.renormalize,
)
- if routed_scaling_factor != 1.0:
- topk_weights *= routed_scaling_factor
- elif custom_routing_function is None:
+ if self.routed_scaling_factor != 1.0:
+ topk_weights *= self.routed_scaling_factor
+ elif self.custom_routing_function is None:
topk_weights, topk_ids, token_expert_indices = fused_topk(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
+ topk=self.top_k,
+ renormalize=self.renormalize,
indices_type=indices_type,
)
else:
- topk_weights, topk_ids = custom_routing_function(
+ topk_weights, topk_ids = self.custom_routing_function(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
+ topk=self.top_k,
+ renormalize=self.renormalize,
)
- if enable_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
-
+ if self.enable_eplb:
topk_ids = eplb_map_to_physical_and_record(
topk_ids=topk_ids,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
+ expert_load_view=self.expert_load_view,
+ logical_to_physical_map=self.logical_to_physical_map,
+ logical_replica_count=self.logical_replica_count,
)
if (indices_type is not None) and topk_ids.dtype != indices_type:
@@ -1633,16 +1631,16 @@ class FusedMoE(CustomOp):
# Compute zero expert result if needed
if (
- zero_expert_num is not None
- and zero_expert_num > 0
- and zero_expert_type is not None
- and global_num_experts is not None
+ self.zero_expert_num is not None
+ and self.zero_expert_num > 0
+ and self.zero_expert_type is not None
+ and self.global_num_experts is not None
):
zero_expert_result = zero_experts_compute_triton(
expert_indices=topk_ids,
expert_scales=topk_weights,
- num_experts=global_num_experts,
- zero_expert_type=zero_expert_type,
+ num_experts=self.global_num_experts,
+ zero_expert_type=self.zero_expert_type,
hidden_states=hidden_states,
)
else:
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 63b0e6f573d65..48e5a8907f926 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -331,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cuda(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -352,31 +352,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
- num_fused_shared_experts=layer.num_fused_shared_experts,
)
if self.rocm_aiter_moe_enabled:
@@ -415,7 +393,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_map=expert_map,
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
@@ -425,7 +403,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -474,7 +452,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_xpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -515,7 +493,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_tpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 3f6ea68072b40..66945e2d2a7c8 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -597,7 +597,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -618,24 +618,11 @@ class AWQMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
-
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index e5a741e639ad9..1e57fa218b797 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -495,7 +495,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -518,25 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `BitsAndBytesMoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
+ # TODO(bnell): Do these need to be called on the hot path?
if self.quant_config.load_in_8bit:
w13, w2 = self._apply_8bit_dequant(layer)
else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index ad547dd409822..149e4419c64a4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -511,7 +511,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -532,16 +532,17 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
- )
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
+ if enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
+ )
+
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@@ -554,19 +555,9 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
e_score_correction_bias=e_score_correction_bias,
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.use_marlin:
@@ -1109,7 +1100,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1130,31 +1121,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
- assert isinstance(layer, FusedMoE)
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- num_fused_shared_experts=layer.num_fused_shared_experts,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
)
per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
@@ -1377,7 +1346,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1398,26 +1367,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
@@ -1738,7 +1692,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1759,26 +1713,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
- )
-
assert activation == "silu", f"{activation} not supported for Marlin MoE."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
@@ -2001,7 +1940,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -2022,43 +1961,11 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- if expert_load_view is None:
- raise ValueError("enable_eplb=True requiere expert_load_view != None")
- if logical_to_physical_map is None:
- raise ValueError(
- "enable_eplb=True requiere logical_to_physical_map != None"
- )
- if logical_replica_count is None:
- raise ValueError(
- "enable_eplb=True requiere logical_replica_count != None"
- )
- if not isinstance(layer, FusedMoE):
- raise TypeError(
- "EPLB is only supported when `layer` is a instance of FusedMoE."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 5241f9a2301be..7ebe40ec84687 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -137,7 +137,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -158,26 +158,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ExpertsInt8MoEMethod` yet."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 91bd45bf879cb..9e2718057038d 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1140,7 +1140,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1216,31 +1216,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=apply_router_weight_on_input,
)
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
- select_result = FusedMoE.select_experts(
+ select_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
- num_fused_shared_experts=layer.num_fused_shared_experts,
)
topk_weights, topk_ids, zero_expert_result = select_result
@@ -1322,7 +1300,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.allow_cutlass_block_scaled_grouped_gemm
),
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 42d7a67371ae8..bcdfafb50fc5a 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -621,7 +621,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -642,9 +642,6 @@ class GGUFMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
-
assert activation == "silu", "Only SiLU activation is supported."
if apply_router_weight_on_input:
raise NotImplementedError(
@@ -652,19 +649,9 @@ class GGUFMoEMethod(FusedMoEMethodBase):
"fused GGUF MoE method."
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_moe_gguf(
x,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 68a122fd46c6b..77b15db373a3a 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -722,7 +722,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -743,26 +743,11 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `GPTQMarlinMoEMethod` yet."
- )
-
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 01a23168bdde3..8165673135910 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -696,7 +696,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -717,12 +717,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ModelOptFp8MoEMethod` yet."
- )
-
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+ if layer.enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `ModelOptFp8MoEMethod` yet."
+ )
assert activation == "silu", (
f"Expected 'silu' activation but got {activation}"
)
@@ -740,19 +739,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
# Expert selection
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
@@ -1459,7 +1448,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1480,16 +1469,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
- )
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
+ if enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
+ )
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@@ -1502,19 +1491,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
e_score_correction_bias=e_score_correction_bias,
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.use_marlin:
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 2090c86f78dc8..cf348290a2716 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -359,7 +359,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -380,25 +380,12 @@ class MoeWNA16Method(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
-
from vllm.model_executor.layers.fused_moe import fused_experts
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 66ae2e94c60a5..255b5aad17853 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -862,7 +862,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -887,18 +887,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
raise NotImplementedError("EPLB is not supported for mxfp4")
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
)
return fused_marlin_moe(
@@ -989,17 +980,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
):
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- e_score_correction_bias=e_score_correction_bias,
)
# Backend-specific preparation
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 30772c3665b06..8be0299eaa66f 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -334,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -355,24 +355,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.rocm_aiter_moe_enabled:
@@ -609,7 +594,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -630,24 +615,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if not self.emulate:
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 52656263a601b..7b51b828009fc 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -356,7 +356,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -377,22 +377,9 @@ class RTNMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
From 839c6b7b72bcc7197443019aae32be409f1c0363 Mon Sep 17 00:00:00 2001
From: Chenheli Hua
Date: Mon, 24 Nov 2025 11:24:37 -0800
Subject: [PATCH 017/134] [Multimodal][Qwen3 Omni] Make Qwen3 Omni work with
audio-in-video inputs in V1 engine. (#27721)
Signed-off-by: Chenheli Hua
Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
.../qwen3_omni/only_thinker.py | 170 ++++++++++++++
tests/model_executor/test_qwen3_omni.py | 221 ++++++++++++++++++
.../models/qwen2_5_omni_thinker.py | 25 --
.../models/qwen3_omni_moe_thinker.py | 110 ++++++---
4 files changed, 467 insertions(+), 59 deletions(-)
create mode 100644 examples/offline_inference/qwen3_omni/only_thinker.py
create mode 100644 tests/model_executor/test_qwen3_omni.py
diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py
new file mode 100644
index 0000000000000..88a61ed694c2e
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/only_thinker.py
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on Qwen3-Omni (thinker only).
+"""
+
+from typing import NamedTuple
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+class QueryResult(NamedTuple):
+ inputs: dict
+ limit_mm_per_prompt: dict[str, int]
+
+
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
+default_system = (
+ "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
+ "Group, capable of perceiving auditory and visual inputs, as well as "
+ "generating text and speech."
+)
+
+
+def get_mixed_modalities_query() -> QueryResult:
+ question = (
+ "What is recited in the audio? "
+ "What is the content of this image? Why is this video funny?"
+ )
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
+ "<|vision_start|><|image_pad|><|vision_end|>"
+ "<|vision_start|><|video_pad|><|vision_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+ "image": convert_image_mode(
+ ImageAsset("cherry_blossom").pil_image, "RGB"
+ ),
+ "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
+ },
+ },
+ limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
+ )
+
+
+def get_use_audio_in_video_query() -> QueryResult:
+ question = (
+ "Describe the content of the video in details, then convert what the "
+ "baby say into text."
+ )
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ asset = VideoAsset(name="baby_reading", num_frames=16)
+ audio = asset.get_audio(sampling_rate=16000)
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "video": asset.np_ndarrays,
+ "audio": audio,
+ },
+ "mm_processor_kwargs": {
+ "use_audio_in_video": True,
+ },
+ },
+ limit_mm_per_prompt={"audio": 1, "video": 1},
+ )
+
+
+def get_multi_audios_query() -> QueryResult:
+ question = "Are these two audio clips the same?"
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
+ "<|audio_start|><|audio_pad|><|audio_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "audio": [
+ AudioAsset("winning_call").audio_and_sample_rate,
+ AudioAsset("mary_had_lamb").audio_and_sample_rate,
+ ],
+ },
+ },
+ limit_mm_per_prompt={
+ "audio": 2,
+ },
+ )
+
+
+query_map = {
+ "mixed_modalities": get_mixed_modalities_query,
+ "use_audio_in_video": get_use_audio_in_video_query,
+ "multi_audios": get_multi_audios_query,
+}
+
+
+def main(args):
+ model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+ query_result = query_map[args.query_type]()
+
+ llm = LLM(
+ model=model_name,
+ max_model_len=12800,
+ max_num_seqs=5,
+ limit_mm_per_prompt=query_result.limit_mm_per_prompt,
+ seed=args.seed,
+ )
+
+    # We set temperature to 0.2 so that outputs can differ
+    # across identical prompts during batch inference.
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
+
+ outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
+
+ for o in outputs:
+ generated_text = o.outputs[0].text
+ print(generated_text)
+
+
+def parse_args():
+ parser = FlexibleArgumentParser(
+ description="Demo on using vLLM for offline inference with "
+ "audio language models"
+ )
+ parser.add_argument(
+ "--query-type",
+ "-q",
+ type=str,
+ default="mixed_modalities",
+ choices=query_map.keys(),
+ help="Query type.",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.",
+ )
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
diff --git a/tests/model_executor/test_qwen3_omni.py b/tests/model_executor/test_qwen3_omni.py
new file mode 100644
index 0000000000000..c92c61dcd3bc2
--- /dev/null
+++ b/tests/model_executor/test_qwen3_omni.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import Mock
+
+import pytest
+from transformers import PretrainedConfig
+
+from vllm.multimodal.processing import InputProcessingContext
+
+
+# Helper function to print input IDs with coalesced audio/video tokens.
+def print_input_ids(input_ids):
+ """
+ Print input IDs, compressing consecutive special tokens.
+ - 151675: <|audio_pad|>
+ - 151656: <|video_pad|>
+ """
+ if not input_ids:
+ print("[]")
+ return
+
+ result = []
+ i = 0
+
+ while i < len(input_ids):
+ current_id = input_ids[i]
+
+ # Check if it's a special token that should be compressed
+ if current_id in [151675, 151656]:
+ # Count consecutive occurrences
+ count = 1
+ while i + count < len(input_ids) and input_ids[i + count] == current_id:
+ count += 1
+
+ # Add compressed representation
+ token_name = "<|audio_pad|>" if current_id == 151675 else "<|video_pad|>"
+ result.append(f"{token_name} * {count}")
+ i += count
+ else:
+ # Regular token, just add it
+ result.append(str(current_id))
+ i += 1
+
+ print(", ".join(result))
+
+
+@pytest.fixture
+def mock_qwen3_omni_config():
+ """Create a mock Qwen3OmniMoeThinker config."""
+ config = Mock(spec=PretrainedConfig)
+ # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
+ config.audio_token_id = 151675 # <|audio_pad|>
+ config.video_token_id = 151656 # <|video_pad|>
+ config.image_token_id = 151655 # <|image_pad|>
+ config.audio_start_token_id = 151669 # <|audio_start|>
+ config.audio_end_token_id = 151670 # <|audio_end|>
+ config.vision_start_token_id = 151652 # <|vision_start|>
+ config.position_id_per_seconds = 12.5
+
+ # Vision config
+ vision_config = Mock()
+ vision_config.spatial_merge_size = 2
+ config.vision_config = vision_config
+
+ return config
+
+
+@pytest.fixture
+def mock_processor():
+ """Create a mock HF processor."""
+ from transformers.models.whisper import WhisperFeatureExtractor
+
+ processor = Mock()
+ processor.audio_token = "<|audio_pad|>"
+ processor.image_token = "<|image_pad|>"
+ processor.video_token = "<|video_pad|>"
+
+ # Create a real WhisperFeatureExtractor instance for the feature_extractor attribute
+ feature_extractor = WhisperFeatureExtractor()
+ processor.feature_extractor = feature_extractor
+
+ return processor
+
+
+@pytest.fixture
+def mock_tokenizer():
+ """Create a mock tokenizer."""
+ tokenizer = Mock()
+ # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
+ tokenizer.get_vocab = Mock(
+ return_value={
+ "<|audio_pad|>": 151675,
+ "<|video_pad|>": 151656,
+ "<|image_pad|>": 151655,
+ "<|audio_start|>": 151669,
+ "<|audio_end|>": 151670,
+ "<|vision_start|>": 151652,
+ "<|vision_end|>": 151653,
+ }
+ )
+ tokenizer.encode = Mock(
+ side_effect=lambda x: {
+ "<|vision_start|>": [151652],
+ "<|vision_end|>": [151653],
+ "<|audio_start|>": [151669],
+ "<|audio_end|>": [151670],
+ "<|audio_pad|>": [151675],
+ "<|image_pad|>": [151655],
+ "<|video_pad|>": [151656],
+ }.get(x, [0])
+ )
+ tokenizer.vision_bos_token = "<|vision_start|>"
+ tokenizer.vision_eos_token = "<|vision_end|>"
+ tokenizer.audio_bos_token = "<|audio_start|>"
+ tokenizer.audio_eos_token = "<|audio_end|>"
+ return tokenizer
+
+
+@pytest.fixture
+def mock_image_processor():
+ """Create a mock image processor."""
+ image_processor = Mock()
+ image_processor.merge_size = 2
+ return image_processor
+
+
+def test_qwen3_omni_get_updates_use_audio_in_video(
+ mock_qwen3_omni_config,
+ mock_processor,
+ mock_tokenizer,
+ mock_image_processor,
+):
+ """Test the get_updates_use_audio_in_video method directly."""
+
+ from vllm.model_executor.models.qwen3_omni_moe_thinker import (
+ Qwen3OmniMoeThinkerMultiModalProcessor,
+ Qwen3OmniMoeThinkerProcessingInfo,
+ )
+
+ # Create a mock context
+ mock_ctx = Mock(spec=InputProcessingContext)
+
+ # Create processing info
+ info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx)
+ info.get_hf_config = Mock(return_value=mock_qwen3_omni_config)
+ info.get_hf_processor = Mock(return_value=mock_processor)
+ info.get_tokenizer = Mock(return_value=mock_tokenizer)
+ info.get_image_processor = Mock(return_value=mock_image_processor)
+
+ # Create a mock dummy_inputs builder
+ mock_dummy_inputs = Mock()
+
+ # Create the processor
+ processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, mock_dummy_inputs)
+
+ # Test parameters from reference video
+ # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4
+ audio_len = 85
+ video_grid_thw = [6, 36, 64]
+ video_second_per_grid_t = 2.0
+
+ # Call the method
+ updates = processor.get_updates_use_audio_in_video(
+ thinker_config=mock_qwen3_omni_config,
+ audio_len=audio_len,
+ video_grid_thw=video_grid_thw,
+ video_second_per_grid_t=video_second_per_grid_t,
+ )
+
+ # Updated input ids should align with HF implementation.
+ # 151669,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 10,
+ # <|video_pad|> * 1152,
+ # 151670
+ print_input_ids(updates)
+
+ # Verify structure
+ assert isinstance(updates, list)
+ assert len(updates) > 0
+
+ # Verify start and end tokens
+ audio_start_token_id = mock_qwen3_omni_config.audio_start_token_id
+ audio_end_token_id = mock_qwen3_omni_config.audio_end_token_id
+
+ assert updates[0] == audio_start_token_id
+ assert updates[-1] == audio_end_token_id
+
+ # Verify both audio and video tokens are present
+ audio_token_id = mock_qwen3_omni_config.audio_token_id
+ video_token_id = mock_qwen3_omni_config.video_token_id
+
+ audio_count = updates.count(audio_token_id)
+ video_count = updates.count(video_token_id)
+
+ assert audio_count == audio_len, (
+ f"Expected {audio_len} audio tokens, got {audio_count}"
+ )
+
+ # Calculate expected video token count
+ spatial_merge_size = mock_qwen3_omni_config.vision_config.spatial_merge_size
+ height = video_grid_thw[1] // spatial_merge_size
+ width = video_grid_thw[2] // spatial_merge_size
+ expected_video_count = video_grid_thw[0] * height * width
+
+ assert video_count == expected_video_count, (
+ f"Expected {expected_video_count} video tokens, got {video_count}"
+ )
+
+ # Total tokens should be: 1 (start) + audio_len + video_count + 1 (end)
+ expected_total = 1 + audio_len + expected_video_count + 1
+ assert len(updates) == expected_total, (
+ f"Expected {expected_total} total tokens, got {len(updates)}"
+ )
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 262ea771d9cdf..7506ee8656fda 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -23,7 +23,6 @@
"""Inference-only Qwen2.5-Omni model (thinker part)."""
from collections.abc import Callable, Iterable, Mapping, Sequence
-from copy import copy
from functools import partial
from typing import Annotated, Any, Literal
@@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
- use_audio_in_video = False
- if "video" in mm_kwargs:
- video_items = [item for item in mm_kwargs["video"] if item is not None]
- # only check video items (if there are any)
- if video_items:
- use_audio_in_video = all(
- item["use_audio_in_video"].data for item in video_items
- )
-
if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
@@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
- use_audio_in_video=use_audio_in_video,
)
else:
prompt_ids, mm_placeholders = self._apply_prompt_updates(
@@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
- use_audio_in_video=use_audio_in_video,
)
return prompt_ids, mm_placeholders
@@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
return mm_processed_data
- def _validate_mm_placeholders(
- self,
- mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
- mm_item_counts: Mapping[str, int],
- use_audio_in_video: bool = False,
- ) -> None:
- if use_audio_in_video:
- mm_item_counts = copy(mm_item_counts)
- if "video" in mm_item_counts:
- assert "audio" in mm_item_counts
- mm_item_counts["audio"] -= mm_item_counts["video"]
- super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
-
class Qwen2_5OmniConditionalGenerationMixin:
def _parse_and_validate_audio_input(
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 61f218f16d79c..f5f88f66eff91 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import (
- BaseMultiModalProcessor,
MultiModalPromptUpdates,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
+ PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
@@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import (
Qwen2_5OmniConditionalGenerationMixin,
Qwen2_5OmniThinkerDummyInputsBuilder,
Qwen2_5OmniThinkerMultiModalProcessor,
- Qwen2_5OmniThinkerProcessingInfo,
)
from .qwen2_5_vl import (
Qwen2_5_VisionAttention,
@@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
else:
use_audio_in_video = False
- if use_audio_in_video and "video" in mm_item_counts:
- assert "audio" in mm_item_counts
- mm_item_counts["audio"] -= mm_item_counts["video"]
-
- # Special case with `use_audio_in_video=True`
- if use_audio_in_video:
- if is_update_applied:
- prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
- (
- prompt_ids,
- mm_placeholders,
- ) = self._apply_prompt_updates(
- prompt_ids,
- mm_prompt_updates,
- )
- self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
# normal case with `use_audio_in_video=False`
- elif is_update_applied:
+ if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
mm_prompt_updates,
@@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
mm_item_counts,
)
else:
- prompt_ids, mm_placeholders = self._apply_prompt_updates(
- prompt_ids,
- mm_prompt_updates,
- )
+ if use_audio_in_video and "audio" in mm_prompt_updates:
+ filtered_updates = {
+ k: v for k, v in mm_prompt_updates.items() if k != "audio"
+ }
+ prompt_ids, mm_placeholders = self._apply_prompt_updates(
+ prompt_ids,
+ filtered_updates,
+ )
+ # Derive audio placeholders from video placeholders
+ mm_placeholders = self._derive_audio_from_video_placeholders(
+ mm_placeholders, mm_prompt_updates
+ )
+ else:
+ prompt_ids, mm_placeholders = self._apply_prompt_updates(
+ prompt_ids,
+ mm_prompt_updates,
+ )
+
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
@@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
def get_replacement_qwen2_use_audio_in_video(item_idx: int):
nonlocal audio_in_video_item_idx
- audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
+ audio_num_features = audio_output_lengths[
+ audio_in_video_item_idx + item_idx
+ ]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1
@@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
if second_per_grid_ts:
video_second_per_grid_t = second_per_grid_ts[item_idx]
else:
- video_second_per_grid_t = 1.0
+ video_second_per_grid_t = 2.0
- return self.get_updates_use_audio_in_video(
+ placeholder = self.get_updates_use_audio_in_video(
thinker_config=thinker_config,
audio_len=audio_num_features,
video_grid_thw=video_grid_thw,
video_second_per_grid_t=video_second_per_grid_t,
)
+ return PromptUpdateDetails.select_token_id(
+ placeholder, embed_token_id=video_token_id
+ )
video_replacement_fn = (
get_replacement_qwen2_use_audio_in_video
@@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
),
]
- def _validate_mm_placeholders(
+ def _derive_audio_from_video_placeholders(
self,
- mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
- mm_item_counts: Mapping[str, int],
- ) -> None:
- BaseMultiModalProcessor[
- Qwen2_5OmniThinkerProcessingInfo
- ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
+ placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+ mm_prompt_updates: MultiModalPromptUpdates,
+ ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
+ """
+ Helper to derive audio placeholders from video placeholders when
+ use_audio_in_video=True.
+ """
+ if "video" not in placeholders:
+ return placeholders
+
+ # Validate audio and video counts match
+ num_videos = len(placeholders["video"])
+ num_audios = len(mm_prompt_updates.get("audio", []))
+ if num_audios != num_videos:
+ raise ValueError(
+ f"use_audio_in_video requires equal number of audio and video items, "
+ f"got {num_audios=}, {num_videos=}"
+ )
+
+ tokenizer = self.info.get_tokenizer()
+ processor = self.info.get_hf_processor()
+ audio_token_id = tokenizer.get_vocab()[processor.audio_token]
+
+ result_placeholders = dict(placeholders)
+ audio_placeholders = []
+
+ # Each video is paired with one audio
+ for video_idx, video_placeholder in enumerate(placeholders["video"]):
+ # Create is_embed mask selecting only audio tokens
+ audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id
+
+ audio_placeholder = PlaceholderFeaturesInfo(
+ modality="audio",
+ item_idx=video_idx,
+ start_idx=video_placeholder.start_idx,
+ tokens=video_placeholder.tokens,
+ is_embed=audio_is_embed,
+ )
+ audio_placeholders.append(audio_placeholder)
+
+ result_placeholders["audio"] = audio_placeholders
+ return result_placeholders
def _get_raw_input_ids(
self,
@@ -1454,7 +1492,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
)
if not len(second_per_grid_ts) and len(video_grid_thw):
- second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
+ second_per_grid_ts = 2.0
+ second_per_grids = (
+ torch.ones(len(video_grid_thw), dtype=torch.float32)
+ * second_per_grid_ts
+ )
else:
second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
From 97588c4d1231287eb380cf9fb95ec77f88479b85 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 11:28:56 -0800
Subject: [PATCH 018/134] [Model Runner V2] Add minor clarification comments
for Eagle (#29332)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/spec_decode/eagle.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
index 0f11903e14540..59d0f313d96a2 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -65,6 +65,12 @@ class EagleSpeculator:
# [num_reqs]
next_prefill_tokens: torch.Tensor,
) -> torch.Tensor:
+        # NOTE(woosuk): To avoid a CPU-GPU sync, the CPU never learns the number
+        # of rejected tokens; instead, we keep the size of eagle's input_ids and
+        # hidden_states the same as the target model's. This means we pad each
+        # request's query length to include any rejected positions. By doing so,
+        # we can also reuse the attention metadata (e.g., query_start_loc,
+        # seq_lens) of the target model.
if aux_hidden_states:
assert self.method == "eagle3"
hidden_states = self.model.combine_hidden_states(
@@ -110,6 +116,11 @@ class EagleSpeculator:
# NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
# used for draft and target sampling.
pos = input_batch.positions[last_token_indices] + 1
+ # NOTE(woosuk): For draft sampling, we only consider the temperature
+ # and ignore the other sampling parameters such as top_k and top_p,
+ # for simplicity and performance.
+ # While this may slightly degrade the acceptance rate, it does not
+ # affect the output distribution after rejection sampling.
draft_tokens = gumbel_sample(
logits, temperature, seed, pos, apply_temperature=True
)
From 4d6afcaddccaf281385ddfa7c6078916af7d9d20 Mon Sep 17 00:00:00 2001
From: Benjamin Bartels
Date: Mon, 24 Nov 2025 19:40:54 +0000
Subject: [PATCH 019/134] [CI/Build] Moves to cuda-base runtime image while
retaining minimal JIT dependencies (#29270)
Signed-off-by: bbartels
Signed-off-by: Benjamin Bartels
---
docker/Dockerfile | 16 ++++++++++++++--
.../dockerfile-stages-dependency.png | Bin 134558 -> 149377 bytes
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1b937bbc1225e..e03b9989a190c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
+ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
@@ -328,6 +328,18 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
+# Install CUDA development tools and build essentials for runtime JIT compilation
+# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
+RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
+ apt-get update -y && \
+ apt-get install -y --no-install-recommends \
+ cuda-nvcc-${CUDA_VERSION_DASH} \
+ cuda-cudart-${CUDA_VERSION_DASH} \
+ cuda-nvrtc-${CUDA_VERSION_DASH} \
+ cuda-cuobjdump-${CUDA_VERSION_DASH} \
+ libcublas-${CUDA_VERSION_DASH} && \
+ rm -rf /var/lib/apt/lists/*
+
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index 57a33524a5169c8b56c7de309cdeb243ab3fa918..b327eb2151f50e4d682fe533fa57e12f0b6118c2 100644
GIT binary patch
literal 149377
zcmbTecU;fw|3Ch;j&sa|j8Ji;5(@2|nHMb$?LjJrx}7;?94
zxw1T@6lw+Uq3jsCY|ri$vUCZkh0n%sz#4ZjW^eY2M8X~Te#5V
z<&G7H4)5O^zDnxQdFoKGWJK}AR`U*)+Eq)cmCtJ#$Jb8?O-v3ujppq5R^PV6yP(Aa
zW0~`%>UTtiAz;o2_@nN@kk^0w@=@FhjtldD`6Oz=G5=papgLVwv;Xo*j&Sh*%Vi|>
zOXvN(jw^TT)JS8=L~nM`)urnVKL2y|aNfML@|>KUA3|mPo}VjY>#j?YE4aV=mit1=
zWZ^x*pT8T-te$M5#(eS9C28M%+sne_N^Udq>@)wE@mO!3$)4mov-GaC&X^k0l!kq0
z-g++EuxHDijeMNK>!`R(vlCUib&_vTnEf;LR%Q_7B_hm>o>qg<2mx
zH>xla3tLF>Z<)IY^7m{TodopW+|G}6nd;o7k!<3fmt1QUxoi{vu2C(!6P;BVLJD>OaYl$g)v$8c{obv9@2
zoa#D7#>5nF_tB(V77MZ8TF)J$W&R3f*1>ACZ?o3Oi
z7gM}jp6qm!{keG5rdu+9>k#P#e8f4Lr4
zkuMMXg5Q>f?>>M3*i*6uzWk;SF|q}FEt=nok~tZCf6$Ay#X<QGFxj5IY^P#uv9shV8tndWSFe%=zX5zXJ0Z@gzW(7K(LX4OvrV+~u#
zaUZTl>`XR3TJxiQ)%>h3w~>sa~W3@w&_yE{VGGXY-+Zz6dROJZm_9i&BuB4$g-{2m-n1PZIZDMs~vvI
zqHu2>gZ~mLT<(`C-#u34GF3G>R^lStKjS85T9>?Aa4psOVTDMSr$bLFyPsLwIeN3p
zbeoGs?pl@E3vmB>fzhCuu^>&X9F5*urnTm#?d3PLM``L$#V{5_BwGZ
zD=RMwn@_og&YcEptNIIh`pB#_daujOgiuHL6Sw6HCAX5GL12i{`u|zV-8?x^!Y#Op
zl7Iex!ee7Z!%)sG2m3MahCW(0G$48
zTJf{YQpG)&@Xe!
zCpu8bla%K)KDcAg9~g;5mQM9cMv9;7mqptc=rmkY+cdi|7P?-0EakzRl!u>)yKv%{
zncCwp+{H>mibjf4IKPqFcsaoKW9slIb`=*c|G%?@nu&xbB
zwM_sj>mDqmR2zRC>SU&4S(rx*P7A4O!BM|$KLUXK;r|zknNZ`y
z-F^zs?%|*PSAJW|WEddQ{k+Ub#~ia!3NZU=&D;nf*ZhA(fJ*C_$Z!Am
zL0Q?mh`(-^R=Q1*O7?gKsYHx>bR~~H{UGdUO;K^v_ujQwq<;PL`zh}<`Z#pf!c)JQ`0
zM_xZy9$spw?a=5n(lXIFS5rWR2A$NmSXNKT!f9&2m!JhVC0HbD^83q)k>N7W1sg2<(I!=5*8~6u%>zP)gdG?f1
z)l6@!T;0`y@y^(i8p9whhv|W!8ByE%3sMC35v;LW{rS<}kD2&*EGQKhS5Q;)WhEG@
zShzDvu4t5drk`6a{`6t`nZgHqQ@auhK4iPhJU43)2J(JKHqYksIrB)-l@)FeDRIoK
z(s6O>?i3sMciRX?)Hd4W5i7f6fb7*;97`V`4iZYFd9l?v
z^k!zCHKC~R5zbpHzK=3a9JRM4I_2XrdKBs$kh3h^5jT*nOE&G~b?9%--{Dh0qa&|}
z;2P(uVziQL4E)lrf0zFwj{y&i%`%57E1VednA|@fbLhihd)Ywt%tZFs*C!r?
zb0{e*>(?acr+&J;^aFC0a8Y+-_Do)-d1}jj#{s{_sN>CD*)!7!s1{CR0?fI9e1(g|
zP!#(C*h5f)&ec9Y_fWt*kV`AA*(ZBaKcE#zk~JT(t30ttfc7C)CmTRubg5X1VM$=p
z_F^D2QLOJ+hf22k#3-&uFAflCLx9rg^;Tsv1kYNu`WhOHe*63^YVfWNMe&mT`1OcHaG0vl}SSru;ELbV(StchjrPnDjmM_<#^u+|bmOVyA
zMJ2Ycuq1(B+;b9<+k)Uw*N--D%(4!g=+v1J&YtQvFN;u)Bq5DNEfaY%x}RmHkA>|2
zfVKr-`LB?ePYU(cCUvfNHoLxp>7hHA5Ng0wiF38N~ha&e?WTW&bYD?A#vOGqn)Am0Er>&`=O0R_7mMH63N3!
zu}-@^MEj3A4?bbh?=3H1n|1F)#$Fi1){E7IaMN8L(XmW)qaYpym
zzr3=z4ZJ)OxWTlJU*;h%&4){?W3@nlh;1=~{A3toP%H3|4!b40T%e{y#>KubQL6EO
z?K~M3K2V?CdwIY^J;iM5S-Q9L2~Z&OEG%PHU!zTo|jVXEfr#|U&$KsvLSjQ)4GHw#%rx6G$p
zAOBE9a*lj?UM5>WB0_B_Gk4KwGRu7D5`lJh)h-9cvttmR4IFhCm{jm
z0nHTf*Wn*2wBs3WM|NC30^p%LlZXN6UQ}7e?v(m|J!7z-=(0jq5`C6mA6>7sTakON
zF1ee%Cp<0GrbgdqvqM+BuYkGsN4vHF!Xe3@3Hb|4EfF1%&g%D$B?8A=Jh~sUpD;;V
z?!p}$B98MZNv7w*
zCkwLAG{3!TJk+}rv+xtJN{S`owuXo?wS+UY?poWdfdFhD59lPzSj0s&Op}y}(gOS$
zQT$YRl=aGQ%ijg>zfw6~kLn`E$H!;a8J*G(0NoOw#~hg%L{zM()Q_1Cu@=)Ek+CsV
z69Pu%99W<8*OqU(AM4a@Y&TdctimU0
zy1>VaPl_$2K-eK3aXmS5XLic~%{)b#w+>v|c^t$%fjmIL?iA@~N)cL(27aa;ytXax
zyrBWq+Si!=PEj3C#GS@NtUD`{+K|nxjEs!3z72rozK7O=e=;Wf3nE2rh-RV0sUPGt
zJXSA+GEfABko~R}A=ZD@k^+%F^Mi_qKhZC^-*R=m_FJN+U^x0YuJf!PreKhvp`mQp
za4}O!5LsfMK7W6HIT4VqEk8~tn~msLo6Ixr0)}PDTF#7DWrxZIiPPwamVICr$5YH2
z`sHQ@<=U%abpk9GP&wlt!cRQmbN;x}`P&suq$C<0*@hjlu1jQx5~rTaiC9y9rEoOdzA&4VYNCU-LRG+%u
z0yQskpo>I*qCwHSzyDa%Pm(!6r9FUA);fh(*PdX~+v`&;hmoRr0sLeWwK1aFCcw5<
zAnZcEDoro^xss0!F`z=2dw1W5r^h9RM`Z*Wl+yKDG2+a3dSQX~Mnqe5nC_jK?$!Ak
z2!^G7f0vqqNK;v%p8)qt(Qi3z#eQNA!<=pD-N|)f+7lkJDQRiK$W*E=eW=Hg!$W}<
z?{)+^n4`Y0|4{~Xmt!>x{e^e{RAgmjWV*+(JW9Hpx-us5x$6k3OoUMK^x*XsNu2<^
zyZ5ZcII~B~J->I2(
zk`kRm^Rb``YF6I^{cN}m!~-x>ks8QdiTH1rMv7T`O`>6GylX`{iv3_`RqQZ9M|SN`
zeL6J1_>0(@xr+@~4}LA(h6$4PU>3s+`|ui-hsWHV2cB_fv(P>jZ99E>GuF45+qv%+
zNim>Ug)c5N86AHJne1p=ae&0|Brvfk(o<1;SWqCuMG{=Igb)z3T-uy~K0
z*pO08T1Uj>^hmZG-3me~vg1J8bkqa@%61H*kkQcp?xTa(4UnW|ut$kVOaz0}Z~rVE
zfathX<}5(SeQupV?M!=ep~HEX)Pd3$Gg-;^fX&NghN9%y
zAya=qRg=qXMRCZdu?O7Co*s$~awxg7LYM=y+rpV%JM(1i&Xdh4*;A%mXr}sceM!VSuT-
z)Sq*p+~-4<^dTT%uuQJxq=xBGswIKGB0Vp*HRXAfls{AC!OHAXOJI#wPkJ|8XC4&m
zJQO(wEHZJ`?Y@fuc62@ThcWOE|7<*vU2o<{g5y~KbCa5VbtZ{2gyRxzyvAi_O2v5b
z1hPR3QKvzw?vkWLOUPZPamGVz_$`?+=c#e0&uVd8b~BJY=szDc3z5=_Wz^D-hRo~B
zV<@}-rt6(tq**#6k3sqLLx?nikZ?iVL%yCxmE-tK#j*W6Usn31-<=g*tvch_OhR8#
zYr<@MKenUp+0>p3gFUh75%=A5j*BbPe&`r3G|$H!@Alxkzj@`;0IjC>LEUUb{&`I0}RP`
z?yS+%WC}nD(=c^$r2w*rNw6bBtYcc_`Ws}#wQ5<8cJwpwVRl1vi4*@pXXMS7Z5E76
zew~U%3~`y6t^wsOL+!cjHLsinRl(8*7-ci)Dvb`%c?MtB|lwPlR-&e&}61&dcm1DCqFFL=KNIu1X9cmw`7*RUIaIbqhF
zD&L7x)t*yMi@U(wpU#3?NLQ}oW_XySJQo+1c@+POwTBmfPYSQT;r3yuyRtmceIIP|
z5LLLeCYT~lP8GH-KoXbJ#BdQwQu$u&>zS5P9Zw0CqLJ9nKrLGoHZOSV#m`sQ8+pu3
zi_fP_Hvgz7s@s4lOPce&EZ;C1h9JrawItDrD2w}z2y@zhustbH%zlt#Rti)8q2st-GXyuNfkKdQCZDcc83qLuA-(?I
z5?8(o$OiT$7g<(=5&*~
zQD6u|H7eN&-!D=n*0KG#4kwDAam0nDH?y<~(2SLAcZ->+k)5pza>7R6sbncQOwN+q
z|D@cG^p1`W;sw$Ox3%i7O@c_9Gp_WD>iYhhW2Z*rJ2pQPI;jZngZI|}MiY0Diu(w~
zg3AsH4GAk0`Vya7>f5vF!7vGZ)Ej9!p~!^*LJH^qqIAVJ{&YSnZ>;p2_swqIE>%yR
zsSXuJJ|m@n^)@9dR!Y^F=r(OZF5=KXeFZO^b{G6|gXpJncbG(yhcANpLb#>e{uu)xwMUX&@Jz}x-=6g}{K~8|K
zsqhuOSV_8_w|KKW8LE@wcNw($DE6iVNT4+#s#C*tv0nvYtCGr6+eB;zvXPJ6-f$N%
zBsxclinGC<+e7RxdNUx|+d^NSrLqepod#hBZbd}c(3|0A7NZht|Idwb08+GS;8>Qy
zBP10*6=`eW+pCqpQXh8+Z=^Q8IL?*ITBT_zbHshKuMB?;D=h&E`uyEh&
zH3O)x+i7GYEMpdZfo&>z_n7{S{eQJnp1UYeOoRYgFd++M^|efksgA2GQ(vW0zqsub
zPj-eZs}B`-=ayB}WDya$u-w~OoCZ(8bm4)2-vwU^W{Iv?I@0Gr3G87S9lD4pk72R&
z#BldgS@=_De}EfFF;b?9mkmN#jAW|Ff3f2bm-*GuvCdr
zx8#RWCm-YwzK}Zl<(7(CB(N}I!wX0WNtWseBLXp!fHXF}JHXQ8CQ33QGLi>f4YS!U
z0h?7HEOde#5_;L7bZVcQ;m0#JzQi05*(KkyyFkjh$_yqNjomc^UTO9H0Q
zS+V~v0>EE``1pccYBy0`FzkW1+dNol#BLxv6s+^$?4`~;j~)2oaq%0)W>4Fbx|o3^
z1(2Y-AfZHjqB+#hEMm7Qky%JaCj^jK(AU5&cmQe*O)n(^8vqB=v1z~U{Q-vnoVy9^
z^fe6h1@&%OEHSztlYIiUfd5Y-G_XlO@VviUgN?=}>;Hf_z##J6$a&U%cy8i35+9I0
ztE<2*3jz~Z>%R=H;E$?bNhg)gUoigb1}eu$75aZndQTcF4FTee8g&MQq5#uKx+l&H
zQ~V1FW+(T8?8ZhjD0=8kY;{D`F5EOS5=uArW+8&c2jrNImS)ra5GjBc%xM?EqrA=!
z2v%N-WwQj)3k3@l`sBKxKWD*82Btr#(aYF)WU8C7zC}mUfTALs}q<(#TtDkrWDC-4;PL#=tM1NJK
zOmwHS6zhnPRGWlu1F`Sz9kmDkq=pLU+mBoHq_%~UPKSN^$LJxCAO9sWGge}Wuzd=k
zwupTpJkwun^5CuNNCzZdXK984r#ERi@<;t3&qEqQ_>>hDz1)^GdaTSz8-Q-l&mQDx
zK}?(4BzRDJiTt<%Ju{@Yrfi-kD=AhLDmnqA4+g~~QVUWv2vzIDE}-^Bp$n*uo~6@I
zSN)4Hb(cdB{)=4jA-YPy?}ZV(CRp$A9C)Vwq$16#W<+Zfg8yh-MM40zb{CR~9vGHR
z3&MEG6{$?=@(sxwA>2n1nKN_|Eh5smRiS~?8!>D$}k
zS(FkIeAbm25M;8MT_PXO*7JAv;4J&l*+Y~%7&fyVJjZ^I`|JQPyAL~+Kp7$xA!V2N9!DP5Jh)he4E~QMi{Mck9
z#Srxxd;D-773@yX9g!P|Bn1Pv4A9rwxn#YL<6Z)q%(BK>Q7>!*8dth1L&@8X%ykC&
zl!a_#4%FE9-+pW(h&nQ4iN=RIeR`dG9fU^eEz5vL0Se=;5hsUa;|Q_X@bKBjF&K>y
zvv@HM89X2H9f@&41VALHhs08Xq;{?$&6>aFj2E#!0W)aiC(&_6q-atXx|8tp_{a2Q
zsx9O-QjjqH(t)BvWFUY;FS1;gQv4TDvW^nh6rC2gH|+fj7u_kmnulj`q_<9erxq!8k0kiEZAz)Tak
zPIrMkb{R;q22Mm!{mCbQlcU5)vWJnvmvusFtN0?!{OduO;p^xu4vbL0uw+E^^iP*xhT
zdjauV?`xO=0Rk5Az{@qttZxG@wFl8;1jwaam*4o_>*IQ-`1Q>-1e+)+jegX?6~!Bg
z_kxNOl{iMC;lW-#qE$pEI^dh~H-6dR-x!ey5-Lt=v_Krx?o32FZL7B^*iG6_s>Wfu
zQcDDCtKR)=Udu65g;0wJG*LZ`D`?t>+P$__%qIyME~P6cuC!!5s$U32xUR)9J}E
z!o%L&+2n&pw`Q_>#)-%{0g*9joWzK&s+GpAI@o6tzLR)LUJmHkkZ4E56VsXevi=<8
z0Ma5#LbaeeUXKTDNYZ}%yIU;mq8s%_8f;5|IQ^lj#17TqYUL7hGTBf$>9KsK7-j(c
zPMS2QC$eWmAkf+Z$n$RiEH5OJMv4;%A4~t6P3DeYtclHqOv__j86^r3ZEO__7ix{H
z;8qw#C&(V~sm6_Xe{_vXUICDWvF+wQkBTeC_{k0=@`|?eSc_w~u}%SWAb(P0NWK(i
zmb@}ZJ16G*6-x4G5#JJBhLaXKdfLBe&n?Tv~uj66HDe#LYy&e!@4&dZ)d?
zWCj0EZ*5jMB#pLvin95meNAO}V1QRD%92P~uw(qLGuv~}K0t^Tof<4V_c6l`;B2Ei
zUc&x5rLK@ldz1rJjC{$etiSTp>gzewq+KWUOPa=$+o7HgV&bj-x2uu
zUcJjeiqA1b8$bp2!1`>t!2xIe_(InS|!K-Z_AbNUQ+>S}}ZR
zZi7mXTwHDp!cG)tk}B&_tHHab`)^j6BGnV{vk$$S%b0|DAy$sp*G!EMO&mzS0A-06
zJWiH)x5yPy!^Rkj?v*`h2|uz+P#Xy2+f`DV?~*j-Jp?bw<)qb;Xc5f^c{+!XHc!ipy@q;)9Uoa8YA)H!BmiCLI0>6=T4R0rsvzF+$ZZ-8^WZ)
zrb!8b_#DZ%iR@30AN$k3SEF(KB&jm2G(Zc>my6an!ydS+MVj^aEg&VRqm}d?VM7Rp
zBDb|W#chaaPFgw0&49Xfi76o5FaT6AR%p?BSBlve?0gprV^o#Cd#GQl+s+s=gYlI;pcRr=1m;d}80s7mq>^EcDxDl}B%QsRyhs3G
zr1{vXmU&FJL>~;gh$d1~AJT04`o=0oVXLzbdCw)t_5+2HCxt3&*R9@|@hn|TmVF(}wp)i3!M@A1A6#M&)U12w}fxW)N}pY^B8?>
z>ZONMb-7Hyy^+o|3oVhv5yz2_MQkFCHK}comy;$ZC(F~Ujrq>L&-r`VynBZP%DgCGkFi*ZU3P)7dgc!u+#m>Y^}zIjg8z;O1DKmPBnLsnK+`??Ek
zY-~m-h_&gqvDN`TK0Y!h!h(W=IF-WXJ=^DVyaAwj>goCC)oa&`01(qG+hiYZU}tAf
zGHXzO`SN8)T}rG`MTGu=|L!9u{By<0oh0`@w;wVZ`dWIBJ_r`|3`!Qc5hVF1P-_nE
z*s+7r%s5`PJb3>CT(MTY9Fu-rNJM14(m3T7ckdnpljz0w>Mv3ujaxNkWMq`5S{g3;
z?Y9UlDyK?}<~&1keua|E@;kEXKS7B69bXIwA$KdFth3Ulr>E7^ZK8*=!oxX?_qRbF
zc#a+~zCw0hJ$?L3{MK8+t_4S?mf-)bY9C@-uRN#>HRm?53c<+r=faL6J*T(kr5S`-bs#Q7HQEK6@x101XBa5T
zQ92O&69jR~#VFthW29+%2$D&JMaw%o`Zm{pc8iN^*QJ=N96fr&d9qJR>&5x44KIYP
zJHi2CmMmL#FFQM1Q%h^9pyKRwFM0ZFLC(#vX-?5`2{KTujrDwy7
zL!IM$t;aKanrmumGGM18C3b+LR(eQGheObN7=X4c$(yAiv^A+|X`EZPZml}(Z>RR>
zk{rkU{Ctw!4jep~bUJs*6darOH_8!8w0ZO9Em^WeK~j>k{1;9px{~{Ux
zaZ8Dd+rxM6!=d8hVgsmpl>8Ub(N9-MSITB4rnO8=y9ESNHs2x7k1=rW-Fu?QPe4Ug
zRh8Lr$(HQg*RKhJgn)4Kz<~n{jEuf!ny~N~Hf#w0^hr0#wC?1J6)P6`@0h(`?i=2@
zS>vrO8YuRN5M^z3B&LwulJ4cT7LiHRE~%y$FJ8dPI3jG(d;{{1z;;&Fw-pt99*K)Q
zt7L=3qlY@Hc4CUrVXk=k^y!Biq@<*h&XpZO0NcZ4uZTN!aLqnk9JTu5;~R5YGRZ^l
z)zQ)E>;|cCdvQZCP2*4d=C`Z1Z{OaI`f86O83TVErtCPjZ$G?vC9@LZluYbwWDFJe!RK$>x>bfY9TI;?
zv|3q?LJcnv$UFAzkte&ZI{t}&K*0C7KgRrju2<0cS8>M1rIJ0G={7Qe(I?KH4L=#J
z{`A3v2fTYR&t&A$SN(wK_HF%l^Bm*o7;VF)J?cB9X`r4_tr{kN??!wPm
zUcG)@Bi$4|i;CAyh1|SgYpa6Pu$F-#1RCBm{0`fh*(6N$Jo?}16vA`{NVsUFX7y=}AU<5SMOp`Hd26}@Qf
zO!W^9m*w3AFV%}!cibZpj!`*c|v3|hMipb|OZy9EQmd?&dC>sSOC2}A!Cs981<5fT0_D;L8#?=YG0UpmrY9nmzG!*w=YU49S_Ew$Q
zXJxps?$w@UiC(jRnf*U`Ud>dC!%QMJ%r1!t65RY`ov_
z$U7fybH(x+%!Hkt-3}okHRJ(hO-=ue{3hYR4NqJH=gANHi#ePG?pDS|Btp#V*ROf^
zy1Tm@m4)6RWjfu~oX@Pj`oCN%Den6VL`X+Rl@v1Q3lQ9`w{joDRyf9;@$GWWIgDw`
zVmh79fP@3JSk;9jnWrHkXVSFLhS%KI7KSp<`PW|uu(jOW+!UavF5$2Y{1O{sE2CQxaL?Es?H%$SRz#5T;$Mz+)i|T&|
zf^7Zu{Z=$J>r9OFR^d^t0iSlI)FHEjU?@_I^RFsmZtx(FkUxl$x1L*DTA;YDP6N;f
zrBeyisrkzng&bexaeS?j|8B+MFI$X3%#{G;&uqAc`kLf4VIu}T)gX){U`LzbF17BD
znHo4`QVM^Z3@tY|x8`+WpqK{mGVhLqIijadog%%rfJ|?AUDm@q{S*ws$bs64^kr20
z{MjFB|K8W@#LxSBt~qhy1S+rmnKL0~A2ZI`2uo&7pidT1_Ai<}h&U3=8H*zv?{=-b
zg^6ovZ-0h;^$ZbYfJ3C)0V)Knn%*oweB=n{zI`9MtmGGNXJ_w3CyDTdFMt2@>Xr52
z*Arvo<3?y9HVFHzS!D6z#huP3fOFQ^Ap}3q)?GQ#zQ6y=0aziwd%H0qOL$t_C{N30
zZEX+@_KG}>nloFT-mhODGHpmx-XvgF4snxV-MUb$vM)+7!zO?SrCaNEH)H$2+YE<_
zU}0m!FS48B=>PKN%U}U_HmQlij-zq^{PU01P=|72*?ia2udqz3R;@Zlqn$ucHm}uJ
z`Jt|wL=JyZBvse>uO~NC{XQuvDQHiR0Wo>k*rh>Ud10kQtxZSzKC$gR5
z*h>nTt1GpfSyUAX`tFr0S88@(L*%d)mdwi$y?WkR_GkXhnF6{b3Pw95%MT#@Wq11e
z`xU?vl%v)8>@Hq>e`*0`@d>FA+k|}Idi2{VWae!RFMvvn(INn8F$AD~iQsXHLzqiD
zBYbAEO@cJy*p*{6Q|A{GC}VKSc?GrT$&I2CU@!cA_RBNm?c1Y85B45}+vjW6m}P3IL+CwL|HVr{OA(z_3Zz6x}nA=bmRMM}IqRY_8t$+F&
z87ogy11ieD?*%Z%xP!m#_F)Xh1@p_J%%5D4zfk1bx%*vMlt*6+AdSp
z%Fh&XUw2Wc?_Xa3kqOybJ>z10(P0|gkSFMIivWrucc5G|a-_e
zDMJg~;}KXnQE6JAR+f`d-23)|8c@x@
z4Omhf>e7Khia#^rvHO4jy)YBH<2ce_bZ19pbR>E&6cF`$@k_2^gg#+?I*=VItpQQ1
zpsjCRzkWR#%B1zguxTcY_HkhR=uTK>5u~62QT+S6fGC4FY~G37K=09(CRy>}UcGDn
z{{GU%F?^3w#iFq|W4_F7e?2Ghly7S`30l
zJ=NkVDRG6%sQx_c_O*Zh8H`^#mgaD}6wTLj&y!nWX15xf2nxS~;rS{mJ`GU{1Hq)H^RAe-TD)Z6H(i4g+)
zpiSIm#W&;tY)+!4lYv_FVwZP(czP>FJClwC8Ms^TVO(u**xjQBxgYP2m^b@y
z-sqqyK)Jxl=CF`-_7kR!Sy~`}dqGa$rh@o
z0wNFlYzxh(^zqeXlbXNe$YuIjPL3ogO?c$yLTFaay}jmQ`-vD{fkGj)AIQiT
zey0(DaD-x*JL?XR5jpX(kS|S5WDY56HHnWMOHTii9Pbl?xOsCFAjwfWnSp4v#3M-S
z`^W=1gAF|-2oY``jqW@(q=H5i%YDmc1H@tV>eU1)28h}zd}vs~A~7?W2>K>X0*i(0
zL4ZAV4;8rh+#AImU*xMHFkHuksC&D{#O5XwhU4({WBASLIcO2;NfDJXT10Lk7;cLM
zg6cS-qX5j?2?ZHP$Cym<5-On$&U6`I)%K~zn+ID8d<2AcVz}rkzf1TV>hrPL!*u}X
ze*X#?Z;-~01y1@0X)NOs`ImKJO5A*rr@>7$z}jkR)JVwTtW=@wQc7Nu-=vxpbbzA*
z2uXw;LKh?7N_Y-cdih3PoUH8WiGic`uBxi4If|kdk;8OLR;;*xYJ-KfHCeotwl+f>
zb=+wC3NraM{imG8@BA>#sbJOA3r*Lw?c29Eq`SAEacRe=g|4d!E_x_xm%vKk`<2Bj
zamef*W>2ye*eC?edKGD+maWpVe=fq0qB~?92U59c$BrIF^Wqf-78Y}hljMd=aYISx
z!np-J_Z>QHY_&4&mFeYSaxCaKtZcrGVt)}wVh~N-?L9YDa5~;SpAk2MDWngKc#MwL
zzzAsK5B&b5av0E5br+|ucGJdTNDV?udFYR}1Ihe_Rs(0F(v;G@TjzT
z)>tTOJ7_5mRHaB4qsKXTR1dlS=x*+|HmjegDEA+90R+PpR;QOoDDOmWms@u$mxP1C
zaaq~>v9Ynh)Z1wUQ-G|K;GHfZr4aaorhx8Th_}LCJo71r-ymv|0LCrZX|?fs*K3na
zjc3;~cRg84`~I`evU4775X|ZbRd?9;w1THa12+4BmrYuQAFS6NCfZ<(769!fBn_m)n$T=
zb}abDjo-<4psW~u%t!?9=p^u{Gosx@!`pa3gw8Pu_gWlTkoG
zz`CpY1l_9d%i#bqhuI`Xxh?+p4n@CA&>7VDxAy7C8f*shDGpA&y=LpIOzHRF4_Er_
zDOF0+reVfz|8)+6bzdTjd1?s-%tcUcq%^9UPmGU0B(oX*VrJmWbjd1;<7gKKlMsn+
z%*wi1Hrc#xJ38KkISAg66
zVLhyMKQq}PHZ4}a?8pR={T=N+KAnMqfoz5gD2^qZC!@}0I`vvEq?XiX&Pd>M65B$X
zD9IC_T3YlonHU+#U9jf}H2l98>~Y$m5ThxiwriaiPRTMX;^N|hbmsLszn3bs?49vI
z9Oh%Uxw!#_YnUgpAkaMp9t16*5_X3Df_Lu{5KSX`j&$+D+){KzZo8FB?1le+FAXS5`CoTk^>}_
zmM>rax`{Qy()=@7f6}JSzp%n}axc0ImdzxiX<>b0+0H%-;a|{%+O<6KFBR$hP%m8=6dkXTW4fc11tv!-^p>Z?0J1opukXXqYn1?taLSdcaFb)9VZakIdDF=n>UF@4wy%FyXyf{bo-G!iLsj{
z#sJEX_vNt=9`o;)ywV&6h_N*iJtCG@}95Lqr{5VD&0^O`YiLB`0aHP{vx>7yC|+jgG$e{_anB
zZk;gs9}5A1mW+wE`|XQp~xDzcm$
zARQb=rhxBvpFAdx3y0j4qtQq&a7%A*EK2CR+}y+9OD7Mbo#3A4c(sAoB*
zViQ>28^HVxLec3&`?=E9C2N)NhCYbasyK#z8=1jp^Bvzn@G@g`a*zOpR@U6n5rKsx
z;yl52!@Z3`>>id76Umzg$jy-MP|u_D7UE?p9=v^}rKRKe10Z}D_yp`*g$S+mWYXgX
z*t1(osH-7e1p<^jw8yAkt(7ZR64Ht}FC%&u6H`E4*hB6WEaFi(M|0fS*%^WU5H=((
zvz@?8NN)0H&z^O6R}6uRAclGH1Wak^=-|YaNOdFKFvE?syQj$l%ziPPKpEp>WdwpX
z2)n#ml0IYz_;&;BXzwUbXGW1|Z3Hck(KiWho7q@3gu>*e?yEq(i=8
z6zgs{x&h3(px;go$ASztIn-x4)osALR6=9#u5jCW_GGcS0%
zTwH<*8HUodL0em!KS&X}{2LvYDeuqxpzC|d#!Hj|X}pc?q(62sCMM<@YZEzPT@ZFj
z6g@nTca0cArwHgte%2^AGDA);HJB>&nj7Q>PVqVnF9J9}B`7F(W&^RW^Z<%0_xd2C
zyhu+s+@KSov|}49D{H}|mOK)_=U#IasutTZ>-w3uTT#w{06l04sjtL_IhqT
zcsnG76@E<<0P_(<{JuLQBO@us+TLs8UBSq5Ec>Hgp>Tw<6(J9=iR=2n-@nQ
zjg+EPOJh>?&`fn3bIGS+=Ylr|9eJkkbRSa+TD8a`#6yQ$rgy}HIpDM2W*`~J;CHl2
zKI080`-sr{Srr)T-T68Z+nx<*=G_PLC+O5|FE6jXhQ*sZFJ{k7r;|-6ZAn~)Zu{M+
z9JkSbwNYGL+(t|Vfu@I+e4CjVe)EqeiBrf%Wv~4OFZ>ge=YH$f@kG-@hyL!v3sY)_
zn9V@MLwl2=rn8p6+Jh`|N1CM*XKE5BlJ+XD5SI`S`F^MfbLlB_?Nm4(l!#L}%SFVJ
z9m2xua56~(g}|tH#}A;Mh^DXvyb`%KV4^xopIpzH8rJAcBiTU2rt3YVYSL~?EEE*T
zvVX2FYmlGRb%1rT0$o_;|%zEA{Gwe^ukv(ni3X_2OPw)-{WRxW^~eB
z_6++AonqO9sOp0W015YMP1|D?o^0UPzg!bTd
zY6DSIF*)Rf?hfc(_^M5{48zgZwL(HdgrpPk9k$%euJm*hv}Gelmtz#f&cPvH3_;87
z_H88v1v>CE>D8Xax5RJsK0&pb$J)FcYCnB{`TN=@oeHMzkeC>VG^sdJ&Yo6TMu|-^6O9$dYz7AAfXLN21aF?Ve?*ibsh;h;T(05A&*T0*RVm8Q~>~Z4gIrcPad|UX$7r3tlkcFM$fN*0Co*0hNuWNG`YWq}55?
zSi&eRuEdV#7iK2dV}&n-+RS9>LVHVZ+8(R^w3kUuD%e
z?53d?5D;(&BBH?tzwsn;swq_O$@$L%fZJQl*H%ms=2g>Gh2mM)W2%FYJ4j!}z;N%J
zrnB{zj7FSoGD&_0UR8>a*#_xQ#soX56k@7zl%JnpAKn1NNMufI!+@SqY^k_6B0@rp
z$h%3McGbqfbd`PRu`;XAu;Y1E4XZ*!#Ms`>PG7iZD%Qz9K)Ls(hlhauNRO%4K@$^`
zP(2s;7yPkfco5&ud)=|aDmWBFZ}*{duf`-1DL|TKo2Y30d0X3*Hxm;Rn@ELz6DVp&
z-j;HWud<|syFiZ%lOr>bmH$;Wm&IlBl%R-+h8{xFc;Ui@XGXJaPXFGXtH@#O&YoNm^N>|4Y|<3*m1{{tp5lp>b1Vm#?#&)O7`8pbMt0=
zJF|M^$=yvbLfYTpYa_(VH%8AC4o#6xKKc~pqVVX8+Z=wAy09rFT1h@+qwf@
zeaRkY?52%-bX_5Hsr(5vy=8}Og8US4EsD<9Cgs|xB2jYS*5rqSp5)Btg7pKY}G#l&4w@b%T&BbBB
zOV6-db(GWaTEA=7c2J{^fpZHCqnBdI(xr)9h81_Gv2G_24he-9u=470x|gi&CP({x
zZLICkpgV*bq2Z$gnHExo{+J-ymPcWTQU
z090NaR7-vFA|y2yDS%jS)(C0-
zKR8&pCmFktcxVZHS!gRI#}htJ0a`tjaLEDzV{)QPBx5Dq#uiMRDcH5}aDjUw6hz{250e3j`lZN85c_pgwT9k
zc0o&16H05|E*2NCvC29;={ODs-anUtP6C-iTq~OW0#YBvu@S<^5DvYm`gp+H#l7`Y
z8_>P6cGIRv0DMWHfr1hS%VzjO-X0!u*U+BbcCROEg|F6@ys9b{6wM?qyf<;n2<>Cr
zw4L@zOHrDy_IY-(;mwhpI$0TLsx{NXI}d_*ZfiY1d$;K78(_F|1LGlvFll|)g@q2|
zSVWCx&e`g?Q~c|19Fs%)6v&IN)-=3VZ#w<-h@L^4>eYKwBfgovewf1gn5vHHrr(hQ
zl@aAyO@^&dzX=%~M#Nzejfr0c)8<+qs6>1!hcV)|904QE@&xx#e)YM{32z2cB&M|cooC*;W9TjM~?v;OX7kLJ^(tPwAE1qgM-i8Wxdzn
zCDC$arz%g!eDqf&MRKQIB5arT*S1L+HqvR%V5gbCXqH(k$8eD8Kb-*Eh&T2B+Y4~$$dL}v
zZ<)j_6gaiO*Y_A-?Ric*ggQIRpXphCQIze%Yg~%*4G)j?HbG7ytsr=B&bds|PX3U`
z&>dANATm$3TU(_V?_@ZMx7Jjk=O;=_Xz{WQ!RVn7m19}C=m}mXVviGRIMnY*BQMvV
zMP{Ugn$P`wTu=idL+NI*&|kC&Y-^kkSUlV1xKRkUmrXsmXZPL^X$j2S)k
zF3Uvgh|UAno#+5bR1-h{$rb(fA6`EdziWwT9tW1=>3-jbHn~n9#*%N-K*(Rwa8lWV
zj@;S`89@Lv13SEtWg`w<+^-|BmP3?q2OKihxYdBo=-s}#WY02mROF?Z8d$DMJj2%}
zEhgoJB>?enU)HpH-7V1P=)BupI$8b`ti6^qqUKOqNCzx=nbRpp$aY;1&9iWxhoH)?
z;-+b#3kHj>qtqm;tyr?;orMV+0!Yxz3`orAsc;cNqR1lKd8(_HIPH~932=wV%hbEZ
z-D&TpC)n8Y`1g?$OeRcHFJ?y2)*B52R1p)M>@ZP|#|lN^Z3k;X>%s>VCUeWJ34?gQ
zUry@_CmUzE5jP2>WSrKB#37IxwDqFBMNHeG1YrFcL3T0mZk%&zT+Zg-4r7V@LNC+`P2~kNYYce8Q6j_onj4ewWDMX@@P(qUIsZ>%S
z*`rjlRF;VTuk+zM%#6N`HHX?;iI-Jx^li^cAJC?YC(diY|0_`W*42iyGoFK0wO=
zwo;6x1(_rzla57xZ?e({ixx>jP2T9Nw`ke2WoM6xpIYg#5#5GDfF1Jc8@;_kK2Kx=
zFeDb|Cd6N+BdL$vuJQG3fi3OCgv^js!vv9h6wFKL2L@ULAQKEzp?FIgY8=BC`$q`Oe4
z8Mm)&Z9r6fL}eW)gA4f-9?E
zYG56`+r@6_R`uL;)AHq8+rrixTF|UjtHT7t5>(zfY>elnrF#x<^9%UAZIE@MhYb&}
z=dJOSQeVG(KAK9qG@xbB!wkC4Q7Y-64}o+30Zv=aW_anb<#XxX6PI7XYxa2b?86DK
z?4%7iK^|-0%IVd;*m0y5XX!ZOZFE4Mnp;|_%W}Og36Fddgn+Pgq0{r3Zr>P(ax?mq
zOrfL2&jHa~9MisKOaD7X&FhE9@5{|ueRq?%>(q?=DL(UAxUZ8u3g}wg;S^Q`c5MyrnWTOlsozS%1X^S
zlb43yuy4nX419jS<)3+7hkr=Jc_K<+Qs4xk3EaPb-xh9d)%|Kc?d1~zk6w@gMGvxc
zVeJ-Y9o|N}DiRG7EDQsvwh<39y1V}4_j1Xs^VL^gkp;n}z46LH0!{PZ0AQjYPJq@&
zIqo;>CCQ}fgMotK4}aWZiu8@jw-pBd9xWAh6R3dO-Skawllt=pkCB>SwKz@5XT99
znYTXQV@CLp0=p&cyJ$#@nY=^DouU(f4}SXdyY!;Bf8Vd#>zw6q^33B~mhxJaU-~+s
z=j$)FOYzf~7qfD86GJEG)l0Pv%ig>>z9R4CxxM<+)oCQ}p5WtJoSppc40XZ;mzu`K
z8v6=I5lQ=_Z|h&;GVNCzBGQs18IR#IAZp2nQPD7sawAhWFS-KwXlS}2zo;mn#OVuX
z!Kz-9XB-_G59=r3)fqvpa?Z5etoFdE6J9{pUK$UE);r36)hXHZ^wAi=j0)=2F*y18
za;6^QFM4wG5;6!UVv$EW)aTB&eS2vTHtI0g49FrTs{HNSO!A<2*kHF9>(s97A*EGh
zhjBD*Ptp?*NPqs*S9{vG`~COri(j9(pJKT5=DFe?XYJD31~#+N)*@9NAEetaYgg6X
zfz`+m`Tjn0kAz$ya@j7ML46cEC+Ro2@I!fT;bFz{nZ2f;o7!@-0WtG7FGb|!#xu&Z
zZHCJ^i@@1-$UH-lK^^LfoL`}5E@fqD&zN4SH0`@2ol6Ef%tg)X(Cn7;hs;=xt90~XcO+Zul+RZXYbBeKDqo)4$`@j*LRbX
z4QT~uQ%7k~W5UusIH+C@ZRuB+UWyP7b=qz616gcGr=GY}@XU<7ZN-O0f>*JZG8t>2
zRh1a9(`EC94IA{@Nn=I-qNAhZO>pJNR;Q%x4!V>Bqbl$WW&rAtk}0RXDgD}nx12^U
zE1aNfTRR*fmP|lNxz_E~{rTEsr~nV0uPPfSi%Pm@;?izwHLCLI<~}s$-ETi9dzz0}
zSfVoOk$3%*{2#8aUKO|~s3=($hTBrjaF=WVzZY_L!c|pGPE8#rrLySjNWsG~&Cq#^
zzHDNam-Y~(r{UBP+_vHO^cr?8D7P?_g}j_qI%m$D@`|U^
zUnBfyN7)Qvn`T;FD9XmIbuVw)XSMW82uUi3ywsuxJgy~ypMCx--^
zj_MYDZ=f`)fDIyR2eEO5P{w1*rbR;QtHAKXOvB=v``1c!2?a~6^MoKo$eQ2G1!;`%
z#DGJMmu|`4WAMkwk+!WzZ}o7#cTqZxo^g`=n{FRS<`VcBoV|jR-0Uud%x{(9{h!)7aPq#YJpo7n1E156eP-El>8tz6F(;f;`LY{?
zQs9+?_Fy=Co>ez_2OWm3oJx9h-0e*sMLj8YIMuptYO_7Jd>}tdSltKYy6dX)7z{tAy+$97dxyM36ip@0
zaJ8HN{`<@i9|z4s?Rsc@yH!w-=!|X=sUw})sjmk%`~%P9P0%9(?DFVz=4uwsy`yaZ
z8my&(_}ALCM^i~*?!cj(Z!?_+A~k}9%o5xi2XwuvGK+l8GJR5Muay$htnVTJ
zqjHMvxqAq*1L`CHSEp#nowWS=GA5(!*@2n0-*%W_7!(5ZIfMSi?dM=@5oCvzpLS6<
z-J{D~_2mlz30eF`@CPZDyn|1txbOD0>3E_W~eml&b>@A{E+5dhy~wv^a-x+pcx24wyQeiMjL><
zIXgLJ;`emJ=)|bx)K#2F=AgJiX$h@=Y5jHuUkLYwpm2;8V`)lc{oTOa8B9C>B)zY7f3F
za-#T?4(YqEtE=)m%`Y?Z%CA=t{1f-#`XNrYdo5bEXffC2z%nKhWRs^Yne*Vm0|)=b
zh|->aB!4=tvF)!nfOVA
z5puA4$RHi^$MJRRKihWh{=kW=X4@A}*i#gDXNQ`GG$g>wp-}kFjf5k{yp}%#T`{nn
z(CkC?-T3k|Jxab*S7*%HG5(hpA8Lo*cf5LH)Ww9RE~9&?P0LQXm7po{O0SDFfZ!PyRZ3I_p>4_N*GY;e>TJA}Qg*{o&F_dw
z^z?+~k~zD?pE)y?5UOrb{d~{T`1tq*n1!d8KKj~D^O*Fq`J8S($(4h~xi0OsudDUq
zSw|8RS8ptZv+!a@^{1WXF=>}BjeG$0XKvJ~Dv~Pil#iR<2s*3KTl{bWC*gNdyf?
zIxa<~Bo*@dv;O9Tj-C8eK>yWpRm@iU!VnnT7WJ`=9#vFSB!q#3-UfTato+;;J!Drj
z&0{Z}vrnX2cL}KdX!S{8Kj&DW71NCl!-prP-XoVdd}ZA|vT0G?q~=tMduVZXPeXO5
zj*L%G1lxUDwZm@o==F$-3eIKT#5Lh*)s0$yudNE)L(|@JrQ_~vmh&mkcQ5WZ#4XY~
zf8wsb`>(tgVub5`VIbs8@CN_2nmvSC;C_+^JtJ>bXeK0=5%-hY6j==$)8qA{BPv3?
zrSUTOG|a$Ah(gvUzTib^6nSHLuhrcR+v3Sm+vm2!PeLX|3$v+H!#K))bFV#d{FBo^
z-2vv0$XqNIJ|d@RTDucKgGNj>u1BY>HNKv?weM?1cUDs2`_VU7n$=T@n3g2Um8gk>t4Xte_@=8M=K7MHwIc34dz(&ZT!>k8RkIr34KXSV&3B)8w
zA3ZC8#Mg
zgvco+u7{$g{Y;Tkzy9EX4ZU2A@&;+7pprbKm(JVkJ{lw~VfG4MuYrNVpwlEruJkx2
z=aHJ&Nc>}&MRi$~6UU6PaVv}BX;3$9m0G>y!6Grvf?xh>(=vs^I%n|_XS%@C$ymI<
zE;dg`K<6L#so?eN>D$gefqT^Z#K>y^jcQI#kw?r6;wm}B_Ef(fC&1G&7V(S|k5IZw
zqc_sAfL>7%4Iqn^=Yxaw@%MLHB%N-AmzuGqXcI
ztdQ>O=S}C8#L=8B5Sw2mR(lk3#>cxpxl|ja`1+AY)K4LCEsQv95zQuTT7FyOsP{))
z4E^qUJl)vccgK!N_IXdA&K?TG2DRa+IOk85mGddl(@VE(X(&9Lb?erJo*E8nC&qE0
zEW3@ntpbfk%e-`JKQuGAUfI0eMPE^LzTO9c97+I=dVg3pIvx?za}K}z9{FAaPl70E
zoT`?iAfcwc0k#q)r0u3u`az$?z_+KGT4?rrggz0=-@h!nzZYBGfFmh(wzfjMO1W{v
zH`!JiDIYzK{q>)h`~sIEQ;Kj=;V`7qRNC0W%4Z_jDjG#?>2(5_4Vd?t)GpzQ!Ceb2
zC|rfD_Qu7<(Jt6^p7jMO|ZdBT+iI>51w+Gt&g`$8gXEQuGFXsHn(hp9rz7@VqPr(ukX61HOCq96fhVPYCRN5-NnN
zK;j=-e;$(w%t66oY2k~KmRnKp(L;1I?|AYNLiE5QL^Yh1E6jkDbA6pb=o!usulnVz
zR*}up2cbClOq2wS63d(b1+*&Qp(zFII|9DpC)ahpWj9PE0Ax1=R&nvz`Zf?`b@HFj
zYm!Nx!H#$lc1ue|2#+38Siiz90FjJjc9&a=R;|W)9|#E<04Oo}vy5&OysyyB-aY~(
z{3spwV&E2aO-(am_Di>TVgJFHKR&9-lblgw#z+?Ex+v+|HMtdN%OdFZ+qqM0>bigFoTr;eV!Iuasl~yOduhe*mG}?~1C+)_j}9lVXcxZ}duo}XT_E3H7|
z)Q)F;1B|zF?C6S|eyDi-m%MrNrbo;p>>4iZ*thTX83DEK7O?7;=(hhYNhEi6GZL%e
zot?b=#8H>LJ2l6nvg~>2`Ssjx5@%P`@|8T8_Aut0Wy-0=!e-5yxqMk|{J8P5YK2J?
z@?^)q|NdL|Q6PnI2TI(_uJ4K<|80NvYl8+y4*K>@A)d{6dVenOzQRsw-aR
zX1U*p4LS^F965517>7M?n@@}Bw|_1_Ii9=lxqYJJlS>P}4vqhG|Nf%WL(nWoecDZQ
z&%b-lJ?8R*2MO7Quj5}2o3{_)_*OpuQ6?+KdEdSdFETHmAdEPF*^1Rm2SU3GHHvlY
z4E9YD&7=;sYuCP0HE>NoE}iLL*0FD^?CFWArGEiY-|5_GTIQg_*c%HFlw0rbhb3@I
zjt4kIdcP>0;l9uQynp7cYju>Vsc8q==E3EOA!X@TX-22M9H^m@>V(OmO>E+$f;fD}
z@3+Wn&;HKtSCFyq^_#bE2#Nk4x2GS3jDZMyI>&!{O79o-MQazNpBHTE-*6
z1bNgEuFEGt=UC7WQHLpoF+{adKs8ZsimgFICLvzOW-^R^2`r^KvOB2+#HnNqZqr}~
z=5M{#y*Oh85g5w2OkMugu@OR)5*!8QLZY4E_d6QL8AHFy7j8WDlaq8Ejp1Qdt?kNjEp*eqkzUa`n3z
z)uG5I4?oB-RNnND38+KpZ#P_wA`us04p)kR$Sa@
ze+;%niI9?{KLmU&)FlwQ!5&p-L~O{C6s{KSi!6a+@T>D%(0dAxLEgmfVXMwc-6FV(
zG$E!>-&?a*En)IDYOiY|I%bZr3R>=Fx}WLo9_4H>B}>a8wIWq%LhD<4_d*i}Q#;1$
zN=0{H_qX;voFeHjBg=7ea=OP}!UnHh;Zg(O2_N1YKz}F`bquLPvOrU(ha~oqX{!>;
zCBtG%@w>jr!6^cCfIHsWm;T_B4}buLD46Lq?$4XSYd9c=N_$EwOOXcBT`v;9rZ;a)
zI4T4Oen2yb0oNq
zW6ME}IiARuoDKAXt>SP0N633kOr=!mem#7ws_@)>fIO0(qxV;tMr!TFW$PP!`8ss1
zT)$@uni^MgC#MsX(O;oSYEX8wL`)}5+QF5boR`s&VFe3{b7Kxe%TqO!0a_TPG@Q`L
ztnr<}!#{*=>Z^PR@{g$BfB(I6QFVQ1kN7!S3WY`4=RS8lY9`wP%H!BhGqkM3CJ8Wr&dl%|8pAr%5PkN
z*z}+En**jWJ}GG1hnKm3;3e@mf}F2(W!2{oO%#ryqBe&!
z<*t1pQDQvAPl~q!joYg)boITK94&V_9r3I5C6r%}>nB!Ee)eI8ze3@tW?l7L@Zc9T
zy2|?%GQ79!Zp^%G%70ZT{MI6~3_cL$`m;1C>;#EcFDgyLTNIY_7^SN}{y4Hm!^z;y
zXFhiDoVDm|QMbZ5>4=*wOwG&=ifaHKTAsA$XEG>4;dyV3e5^{;y#IbI>cNAwcnohP
zR)1Lx(2r=jgX=k18HBBsFj7;EZdmp45BSL@vr^0L=EDV7P
zReP$d8zI?~|EUP;AQC3^r^^fe^U_NpDJ&g!DzCH-(>KhP>I9}$E~8T8zdWlyFAXDR
zLkHAh22JqbeC5U|7D1iwop=9{*bcY{*1TsM5!IeC75c6Pj=B}3bYs1ZyZJ;;Il?sO
zY}X)kV<{nQ8d!cORKtN?+V-3MH-AO+ZzU44vNI3=v)@Kzw0H!*G_ufy6{jf(8ctxr
zZ54$kd_N;^+&iblE`Idz@hC;{S!5?l!7Y@L(QxNl%Wmk;k&BU<|l4$$i
z{y|WD^yJC@VDN33b|acW>0eq}S_;>TZ6(IfZTK6L@nC+VpVftQ$rKLxOQ0{GJZ5>Hs-%g&!9)@_BCO&xd5th?o&u03uO)#=j
z`s|gJm%pR!r;H#rWP9O@-%0=D+QS0Y1)~g&qvA7QD(#Ll_becD1y$L_Q7~aD-h;*lL$O6Lt)ZmL>7}?zd!Zjzc&dv^{>>P_
zLH}70Z6!sm!+~%3DP#555W0gGS!s%{Uog|oBp*;SgvY|bfs)>9o7K=SVz$A4*0iCw
zl%#}Pd73j1!fl=m5#LW06342u81*({4TCbtCaRh~H$7uK3Hv0=i^^yu|JdU8=J5#0
z+hFpEDiK74M%fm8za3AI0m(vUweiNXs1d{9PPAi%MEwbtkH>kSnTd%UuRx{qK`&*&
zow+N$V+_L#Cqr4|uo1*a9)u#hgFN8Lp&1#4(LV;7gvlJqV1}m|C>M{kFd$}JThAwD
za~jB;?k{c=^J&-Y;o0R(r$l}SENs5&SAb==;m0dzPeAIavFg@5w@S~
z@`zpgVt)E{u51M9?qOyy&RYssQe!-e9x=VbGlEv%phm6x>Z4irJ}8$0U$q~zM0_W5
zj$(zY!;Knx^NzwOgRbZ0<$e1pyj|d3WVm;Y0Mis76}fyxSO@`7|ADzdd3oGV$DUPB
z4UKWIE6gcKRp&E_TWDLAe|1pUp8!*C)M=PCV1I&YI4ypZ>46Zu8cwLAc)pY}_gzs|
zlKYPBIrcds{S;lv+8T-oV>`P;bsBe!q%<|0%uc7AJxC(mhT#>8;8)_C;ySTKdPd<{
zo2B2p6nX2O_SZ4qkRBO@nyR66P5}3SHUGp+Uf=wBMur7?rx?!|78b>NDBITy*UUMf
zRTk;y&m43DK~^JxnAZAm1tZYi0_!hTM6-Cw-v{GsTHxQF!rxHdOsUQa#at
z^Q}in=LuU=9378yt0J;%)8W|PbQ^{Bn39s>t%_z=a%)&w19aPvMA^=pw}2+ARl@RE
z)o^&b(f}f_lqYm&5jn6#cvi8RAq;)Ijb3z>V8??nv6N)sNKOPK!f)Q2boqrPU#B}~
z^M%Forb-L{s;spB!3NV`$3Vq7$ZHJ4GiZQ`oMLB-rKdOmk!9KEMu(zev)R3rYJl_8
z7fGtb=grZZ|sI$ADx@_OSXYV(0>C;PYgK$#-x@;?&c5rH~R6
zF$NDViv$rBQV7MXnjzJjQng6uQ{H`44%wb7s8I<{H9J&6Jghmtr?5*K@lF(;8scyj
zyRE^0GQUlEiK`QZ4579v(6}38G%m$$DqO7LUZ-x|Ar|HQx~~J?B`}{2D7T31?X8MS
zV**N*vtW-JS>tr?P(YAw{+kl>ca}qtfPAt`I3`rKO=<
z1xSBRNjB|
zRL$Km;aCExt6s_rgP6!_9`u1ho0DOiOhG3Wq0(Cs+*De)Vzb5X$-m#KO`CA&-||-a
z_idvvJ4#|<&ttzw>gO_pEhoGX3YWIKShF3aEhhVnJg)FO-XJ1@ts260nye+Bd+tsP
zX2G70DE_zQpFN#&nAi|AE5ERypqL?$;utk!f6WO+cl_?*{Subcam9@|p==7lN{|8K
zOUhQNm0w69gz^p>Vfq2j@`XPPWKD)vaODP>C|#hK9{*vo-}b3h8EZaur~
z?E%$r-hh}qiT+l(@tZ2FFQa$bx3X-baCO)lhTbSGLV(-CNRW_O@3{P=udbRx-8rSqQy1g(4os424t|zg_)?~0EEM6?ggdz7j
zvHCo+fhYH0<4A~PL}?m{8$v^4yj46?uCTx(4%_Pt-#6}uQQnzu;zdgQJhJkt5$6hp
z*?94f6R<$No%r?ve%=CDFnW26T#<^Z8;+Dv3DtFV&!li-*zEN8KZH;`Wzp#cge;Kj
z9&fAYatPibPr8oUFP_EoSQNVlxg0{4WIa0(e3c1(QGAF!`M<=|5%p>Wzn9PedJNNA
z_*s4*Bd^$M6){7Dd@54z+!W#{)FEwaTmj6a>8e&r4IU5
zim(8Z>1TS|nDrPY?&8uZq8qw1_8pA^dJX+)VX&y%YzHk+j2530^J8`XQ>Qw&uzo^PVBp6XXd7`tz71i{*&fCRj(Na5S3%7uyjEjc|YvvOXHRhWbfYX2L(nPiq2y+Uz7IP$EHro!Y7=+s0E
zqv`D&m!z1=ETk)Y^>O3#AD=dOKuaUt0o6SM#p?(&Kpw%5_lr;+Z6i&J>(H=pA!Yyj
zuq5Su<>z;W&4h`23V(&9TpWJ+tC*v(T!8e(ZAoJfS%=VmBnQo6W~e%wL6&BRq7Of{
zu@d=lQ<74Tk5B3nm_({zVYB8Ip9uu+r{w`9NxTvlkn8x}nqe*=BRQ
z^Mt3mA5F0=w)0S2YMa2+`@&cqbGA4O;)3hP{Hk+Yf_MUC
zo=Ex06WP^#`SP(baCVrs{7o!gO-Neu$h?C1@PYb|cAM1$w9E&6`?`0)j$kh1YY
zMr-3TToG|w>=5Y^PxqF>g7pwiQY8fdxkh`~#_pH*NrwdTftiQL`SMB%2OA-{uBlgJ
z&PwSWVvd4)qIq#SAmvR$lx?gPQqwL(y;fq@b78TR`NGdX{#MR#j_$FP!t3WM6s>>q
zvoHQjx=_GV+yi_O0cHunlMDJ$2v`s@_80`>#jF1Al^@Vk4goW&=T4w*gbF!G^cD>6
z=vhMHE7cY0wVx`)k_F(b&1Z0=WNXpP07Z=tDo^4m;l}XC+>S9MflzR>4eG5a4uv=Z
zF_1QUbY7{Ev9V^X4G5xiMy(eJ4W^!6QD(L(7wSCDrKLa@jWj9~{s8C$~
zfo1AD`0m|1efGLEGH>6hYN7pUe+$nEbUt_C6q+D-+q&nRkoFhkhABN}NN1MpjZhLk
zGO0%2DXcRpsZ!0FGf=7RUG}t|vfg`aMHV|BwL`kOjNq(U}k$hOK7E+Uo1*2&KyF-l$^{pZF{0P1Dj>#O!NOMfi#sOGp$0&pfp`g42i_EX`tTt&WOds~>b=&)pOcV_{}%I9+tWO%=rd
z*s?90RXi@$nB13|_g+$vhuZwU3><>0^<_jUtxVD0VO(XFaz?Smh2w!=aGR-y6s^G!
zjC|d*_T4JjMiqRPBV=4;dL@@JjRem}Rnl9x_=J?0S_5pTi2=!MPv?myj&sTHUT^5e
zIBrm(nwyz5A9zu!;87ImLDQ*j)H@YoACS$}U~zw09j73Zu0OpkS2qF-qKZGJom$xQ
zgL+S%qJ@|~>jci#2^2?D0`ipk>A04#al^6;1+)ZF84m&ScMVW77RCsVua)yKdOLgN
zJdJh{8m8}~S(l1}qq2%6CH6BQ@`FB5&b`1ZMz+7IA?spEvcXM#2Ood`Zh>jt;vYnj
zyhR*8{w7)njR3f@FFO}ajrh*h^5&khJ3{XoOzt!;2j*4?mk*?3?)=RqsA
z^U-v&n;HaxnTc%;CsyZLf&?iP9zH=qFIVcpO4Vwh+pK9*X}F};*HsJWC=#-V%MeY?
zNawXRT}=RySdlcfyY(D6(1Nugdq&Cwj=~{%rI^Ej&I#$eIr36Sb*OJQDht^V`mHV`g2KK5*SBuJvE6e!WfNGdWLO
z)fkgCY4qsP;Th93TS6NwhV~$gb1iLc`hHX9#;rNq8<^EC-bN>IJQ|B^aA)wfe&0DR2L
zEC2<8bYW4@k7ivD$n(rfi{xaD>+$v-#t?S^4Ufu!2{~mT!xQ;>+?Y6W5Z2OltP}zQ
z0%!=HiA)n<4wn#`&Sjgv7?V*}f4Y+>*e}rzKL!W$HtjCFuA1Wcd$VRC!d}SE*iEG@
zT|`bO8BxtVgaMs8HC1eeY=3hf#~8f
zO0=?@A!=&MOAOs<@ttJQx_WZM;DxjmZjs|kw~nbKd)RQ+CJ6V#
z;Nli7TgnX+%QK-mV<>cDa^3(KL1KT*!!X*@%I_b#V3cnIpd6($^XB=04`q4b%3UBA
zsTCR;I>_QK2Z+V@=zaGg#~ybk-Eq!eN*NF?VEuwi+23xfSg0WB(^kAIcFjdaSrr5N
zw=$>Z1Ve}`PQzY-zdv91i%A#8PH?&W5zlGNoaXYFPv@a3d9hHcHTa97@fH8A@AVwT
zrv?hq3l{tu$5rsgj{rR6F4TdEqpqzTlkG<1M?P_?rep`)6(=hQWF(w=s5B!
zKW8~T@#)VralP`};+wI`Jwu=3*@=QPtrumqfs
zUW3&W%BbuYP^ESR22NWi>7{qUi(~#QJQY`LuGEbh
z2CG+{|8+JbrNM-a@KQGE0lwSv=oE1vBV}IdL}F*MbMCx(oIn$ZgK+(kqT7a-VhoM|
zQSNNno0*7#%>E1lX|zPx-8)8R&x$81WwUwMVuj+X=f5rm`VVL-hmqb5vA7GhHe1pF
zy=z%nrj*|%*yqGJid#r;Bk>W9l%?fQ&fCQ>7*|6JV>-SC6*50
zfL#OA4$(+eA3nS~S_kfF{l<-V0_qc)=uQ`wv=wpX+?Mmu4}C$V&}86A&Xu30^u(He
zE1@23(s6+^rdM#7*@Tu(3w4TM8gPISO%A)o{1g+)bjlqIoQ+ts=XT
zkm<7b|54ZQ7T7}t6vJYMoZp0uqH%vGTYK@&K#}q-hZL$YKtqww0Xte!5IJ=)KZ=Zu
zOucs|E)MhS8*xwl2oe&_!+No#MK?zZg$C1a%Dtz6-T;@3tg@m+narG+P_NgHx=M}+
zr`i>d)vH&Fybl0*MyOY9MOgWdfS&Cj5+fmpayfFv#v&iUY(D#0g1IBGqI8{Ry&TUe%caU_gO=nQu
zr=+L*$NZhg1jTP$09C>rL2;6we>44eJA8aXI61k$RZK|}AEI#%yqqC2!B9fTu$5;(
zDhi*<+wB&tFz4uR=F+*9yMB1l3SvG&r9tiO{zo0+*xv(4h%e>IVYknQ#dK
zNzn0nu%+x`Ecqj;H0B|jo7Pf@;!Jc{m9O7%ga|kETPw{&ZUe@nrs3J={?nMfe0ISr
ztkfUH4SqxpgweQl&o(v87PqiblLQZVsE#U3nw&J2I%d!65<9G7ji@uvMSATxvg+8k
zelKB5SWEy~NnX58^EE%V(>wqF*ILTPw>8`b=_s+Vq(&zutZT8IMQ1$|)-UDmQQz|%
zpyq%P7cS_}wY7yzVs4bEu-}U8?@taB3foWcAaNyJYT1qzOL5Fyh=7txDyD`Z;;2b1
zG!8S4G+&G)0EJWUeZ)NyDCOG$C2(R5!%!GF3|lh5j_D>PW20o2XtN&+%&Zd)2GMr1
zx;PfYc+ph!J3l>twOFzhbdP`%WEG?X%H#JLlmT%hli>x_7i~N6NP{Q*38stplXdti
z$~_Z-46=vS$kzLjLr8T5{mcJeZ&lSRanZM-jtPNUxn|TUkwtS$#KSM}4tbT4!yU{u
zxUghy8CON|A=vEW-@AMFF=U&gli0h9esmMJxcn%ZoLV}Fi}Q^+V%jNzV4+UxLLE_+
z5XhDg!{y6#s>?02t^=WYV{Sn@0HG&@r~@gST}vyUg*Es!jf
zwN436?(|}u;(dL)Z8c#n{??~o
zOooszCELXa3J72V(0nG+3vf=QYga~x;B7TkrZg&}?YWfludgsrBMnn0{(x~F7SL0&
z=!gzKG8FT!By&JG=`D2f^Yi_Dd`y+uG3ZC{NheW1=6}m3Df%Rn6~4UI;=N0Ht(p*^
z6?*GaEt7#kFwH6FBeQEmY-7%mG-BsHv7-hEFP%ESox=0T4~ClK7aB>z#wx5ING+3z
zzyQnuSQ60d|Ex0PNhq>si#vo=KrC{j(t$cUQwXHWCDw~$Pddc>qLcIzImIKh-N-h?
z*g%pW(lPNCQND=>aLm(#K<=G38!(dPd(t2Nk-aBi7bQ&dSQV%GL^-^wS2h=Aa?f
z2Fya)X2ypFB+*kmK;$BSYu0LpGK)2-EeK~C0zGtIuAxy^F#8Xt@G~lDtoV1EBFnrz
zxjKz(B1iD7SS$YI0Q*TjZL&i2#k39mDea&wSQIby_;xBTJ>Th{_3PViZ%!YHNp#95
zLNS~xAT;gR??5{J6
zX7A6am#lc~EU6|&t3(uuj?@`%qjg9$S+XusRlpC8#(vU(oX(*yTHmvbFmISzLvbNC
z?1fizgwjW)b1fHX^;5(&ggfxxsW}lN;!AAl5gec3a!=yHib6W06-;zS`9RxOe
z;Cyi4j{`oqz`jSmU_M(*(ZgoK1R)`HkAKjcgSDcfVsf6)km6MCBC&l&6&}4sNIvAe
zIZP>KYRM?+ey3imqX<4gGdz|&RSq!x8!dub{ZhBC3^rmzpDjI=hkwL#+0fZW|Hc%3
z8~=cSa4JByDU~^RbgtI6dcXd9lgZ`JH1BO;A{Hyqnka9wlb?3V{2kELRTu*c7e-1i
zgoSAKVk#Q1$re{@c}7h6wj;O#P91pr{`aB?KXPo5*YyXHQQ|sLR0bk3aa%NzHz~%k
zq)-t{tQCrs5_qV>k&to>#VGW9fF_^`Y!31qHvMO(M