From 4c23690f43e51eccf6ce5866ac47adcf39215e4d Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Tue, 18 Nov 2025 23:06:21 -0500
Subject: [PATCH 001/249] [Attention] FlashAttention ViT support, make default
backend (#28763)
Signed-off-by: Matthew Bonanni
---
cmake/external_projects/vllm_flash_attn.cmake | 2 +-
tests/kernels/attention/test_flash_attn.py | 4 +--
tests/kernels/attention/test_mha_attn.py | 30 +------------------
vllm/platforms/cuda.py | 21 ++++++-------
vllm/v1/attention/backends/flash_attn.py | 4 +--
5 files changed, 15 insertions(+), 46 deletions(-)
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 567c8959f0454..6cc5cda14c525 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
+ GIT_TAG 71bb26f6295449be880344b93b51791cc009237d
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index 6e5468969bf25..26b8c77ab482f 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -13,14 +13,14 @@ from vllm.vllm_flash_attn import (
)
NUM_HEADS = [(4, 4), (8, 2)]
-HEAD_SIZES = [128, 256]
+HEAD_SIZES = [40, 72, 80, 128, 256]
BLOCK_SIZES = [16]
DTYPES = [torch.bfloat16]
QDTYPES = [None, torch.float8_e4m3fn]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]
-SOFT_CAPS = [None, 50.0]
+SOFT_CAPS = [None]
SLIDING_WINDOWS = [None, 256]
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 183bbf3bf4e03..a878ac6396ce5 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -62,38 +62,10 @@ def test_mha_attn_platform(device: str):
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
# Test CUDA with head_size=72 (not divisible by 32)
- # - with upstream FA not available
- # - should use xformers
+ # - should use vLLM's FlashAttention
with (
patch("vllm.attention.layer.current_platform", CudaPlatform()),
patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
- patch(
- "vllm.attention.layer.check_upstream_fa_availability",
- return_value=False,
- ),
- ):
- attn = MultiHeadAttention(16, 72, scale=1)
- assert attn.attn_backend == AttentionBackendEnum.XFORMERS
-
- # Test CUDA with head_size=72 (not divisible by 32)
- # - with upstream FA available
- # - should use upstream FA
- with (
- patch("vllm.attention.layer.current_platform", CudaPlatform()),
- patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()),
- patch(
- "vllm.attention.layer.check_upstream_fa_availability", return_value=True
- ),
- patch.dict(
- "sys.modules",
- {
- "flash_attn": type(
- "MockFlashAttn",
- (),
- {"flash_attn_varlen_func": lambda *args, **kwargs: None},
- )()
- },
- ),
):
attn = MultiHeadAttention(16, 72, scale=1)
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2e4dd8bb808b4..f9bf242b7194e 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -267,24 +267,21 @@ class CudaPlatformBase(Platform):
) -> "AttentionBackendEnum":
from vllm.attention.backends.registry import AttentionBackendEnum
- # For Blackwell GPUs, force TORCH_SDPA for now.
- # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
- if cls.has_device_capability(100):
- return AttentionBackendEnum.TORCH_SDPA
-
- if dtype not in (torch.float16, torch.bfloat16):
- return AttentionBackendEnum.XFORMERS
-
- if cls.has_device_capability(80):
+ # Try FlashAttention first
+ try:
backend_class = AttentionBackendEnum.FLASH_ATTN.get_class()
if backend_class.supports_head_size(
head_size
) and backend_class.supports_dtype(dtype):
return AttentionBackendEnum.FLASH_ATTN
- else:
- return AttentionBackendEnum.XFORMERS
+ except ImportError:
+ pass
+
+ if cls.has_device_capability(100):
+ # xFormers doesn't support Blackwell, fall back to SDPA
+ # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
+ return AttentionBackendEnum.TORCH_SDPA
else:
- # Fallback for Volta/Turing GPUs or FA not supported
return AttentionBackendEnum.XFORMERS
@classmethod
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a5d4435000d4d..fdc99a0df1c8a 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -119,8 +119,8 @@ class FlashAttentionBackend(AttentionBackend):
raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
@classmethod
- def get_supported_head_sizes(cls) -> list[int]:
- return [32, 64, 96, 128, 160, 192, 224, 256]
+ def supports_head_size(cls, head_size: int) -> bool:
+ return head_size % 8 == 0 and head_size <= 256
@classmethod
def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool:
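The selection order in the cuda.py hunk above is: prefer vLLM's FlashAttention whenever it can be imported and supports the head size (and dtype), fall back to TORCH_SDPA on Blackwell where xFormers is unsupported, and use xFormers everywhere else. A minimal, self-contained sketch of that ordering follows; the enum, helper names, and the omission of the dtype check are simplifications, not the actual vLLM classes.
```python
from enum import Enum, auto

class Backend(Enum):
    FLASH_ATTN = auto()
    TORCH_SDPA = auto()
    XFORMERS = auto()

def fa_supports_head_size(head_size: int) -> bool:
    # Mirrors the relaxed check above: any multiple of 8 up to 256.
    return head_size % 8 == 0 and head_size <= 256

def select_vit_backend(head_size: int, is_blackwell: bool, fa_importable: bool) -> Backend:
    # 1) Try FlashAttention first.
    if fa_importable and fa_supports_head_size(head_size):
        return Backend.FLASH_ATTN
    # 2) xFormers does not support Blackwell, so fall back to SDPA there.
    if is_blackwell:
        return Backend.TORCH_SDPA
    # 3) Otherwise keep the xFormers fallback.
    return Backend.XFORMERS

# head_size=72 (not divisible by 32) now resolves to FlashAttention when available.
assert select_vit_backend(72, is_blackwell=False, fa_importable=True) is Backend.FLASH_ATTN
assert select_vit_backend(72, is_blackwell=True, fa_importable=False) is Backend.TORCH_SDPA
```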
From 468a8d72bac181c1499320478940cec64363e107 Mon Sep 17 00:00:00 2001
From: Xin Yang <105740670+xyang16@users.noreply.github.com>
Date: Tue, 18 Nov 2025 21:05:22 -0800
Subject: [PATCH 002/249] [Bugfix] Fix FusedMoEModularKernel for triton backend
(#28913)
Signed-off-by: Xin Yang
---
vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index b95d1a6b3a1f5..66ae2e94c60a5 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -755,8 +755,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
self.w13_weight = w13_weight
self.w2_weight = w2_weight
- layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False)
- layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False)
+ del layer.w13_weight
+ del layer.w2_weight
+ layer.w13_weight = w13_weight
+ layer.w2_weight = w2_weight
else:
raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
@@ -1065,8 +1067,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
return triton_kernel_moe_forward(
hidden_states=x,
- w1=self.w13_weight,
- w2=self.w2_weight,
+ w1=layer.w13_weight,
+ w2=layer.w2_weight,
gating_output=router_logits,
topk=top_k,
renormalize=renormalize,
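The second hunk works because the forward path reads the weights from `layer` again; the `del` calls in the first hunk are needed because PyTorch refuses to overwrite an attribute that is registered as an `nn.Parameter` with a plain wrapper object. A small standalone sketch of that behaviour; `PackedWeight` is a hypothetical stand-in for the triton-kernels weight object, not the real class.
```python
import torch
from torch import nn

class Layer(nn.Module):
    def __init__(self):
        super().__init__()
        self.w13_weight = nn.Parameter(torch.zeros(4, 4))

class PackedWeight:
    """Hypothetical stand-in for the triton-kernels weight wrapper."""
    def __init__(self, data):
        self.storage = data

layer = Layer()
packed = PackedWeight(torch.zeros(4, 4))

try:
    layer.w13_weight = packed          # nn.Module rejects non-Parameter values here
except TypeError as err:
    print("direct assignment rejected:", err)

del layer.w13_weight                   # unregister the old Parameter first
layer.w13_weight = packed              # the wrapper object is now stored as-is
assert not isinstance(layer.w13_weight, nn.Parameter)
```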
From 73ff872db0d4e3f5e133d5d2a5307248619d93a6 Mon Sep 17 00:00:00 2001
From: Gleb Kurchanov
Date: Wed, 19 Nov 2025 08:21:02 +0300
Subject: [PATCH 003/249] [Bugfix] Fix typo in Qwen3 Next model executor
(#28960)
Signed-off-by: Gleb Kurchanov
---
vllm/model_executor/models/qwen3_next.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 86508a7c64317..0415c8e00fdfa 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -1154,8 +1154,8 @@ class QwenNextMixtureOfExperts(MixtureOfExperts):
example_moe = layer.mlp
self.moe_layers.append(layer.mlp.experts)
- if example_moe is None:
- raise RuntimeError("No Qwen3Next layer found in the model.layers.")
+ if example_moe is None:
+ raise RuntimeError("No Qwen3Next layer found in the model.layers.")
# Set MoE hyperparameters
self.num_moe_layers = len(self.moe_layers)
From 6a25ea5f0ea193e35b5a83cb0285c48964bc9eb1 Mon Sep 17 00:00:00 2001
From: Uranus <109661872+UranusSeven@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:30:08 +0800
Subject: [PATCH 004/249] [Docs] Update oneshot imports (#28188)
Signed-off-by: UranusSeven <109661872+UranusSeven@users.noreply.github.com>
---
docs/features/quantization/fp8.md | 2 +-
docs/features/quantization/int4.md | 2 +-
docs/features/quantization/int8.md | 2 +-
docs/features/quantization/quantized_kvcache.md | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index 0c5111fb8af0d..d4a6176b236f1 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -60,7 +60,7 @@ Since simple RTN does not require data for weight quantization and the activatio
??? code
```python
- from llmcompressor.transformers import oneshot
+ from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# Configure the simple PTQ quantization
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index 035e7ea291f9e..9752039097d63 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -80,7 +80,7 @@ Now, apply the quantization algorithms:
??? code
```python
- from llmcompressor.transformers import oneshot
+ from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index ec8a77f74ffef..701ca6378cb16 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -87,7 +87,7 @@ Now, apply the quantization algorithms:
??? code
```python
- from llmcompressor.transformers import oneshot
+ from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index 56cf057678be6..d26a5e217f314 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -78,7 +78,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
- from llmcompressor.transformers import oneshot
+ from llmcompressor import oneshot
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
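For reference, the updated import path is used exactly as before; only the module it comes from changes. A hedged sketch follows — the recipe arguments mirror the FP8 example in the same guide and should be treated as illustrative rather than a complete quantization script.
```python
from llmcompressor import oneshot  # was: from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Simple PTQ recipe, as in the FP8 guide; values shown are illustrative.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# oneshot(model="meta-llama/Llama-3.1-8B-Instruct", recipe=recipe)
```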
From 3d4e7d34be856cc4f54033e6a019059afacb5e76 Mon Sep 17 00:00:00 2001
From: Lukas Geiger
Date: Wed, 19 Nov 2025 05:43:01 +0000
Subject: [PATCH 005/249] [Model][QwenVL] Simplify cos/sin rotary embedding
indexing (#28962)
Signed-off-by: Lukas Geiger
---
vllm/model_executor/models/glm4_1v.py | 9 ++-------
vllm/model_executor/models/qwen2_5_vl.py | 9 ++-------
vllm/model_executor/models/qwen2_vl.py | 9 ++-------
.../models/qwen3_omni_moe_thinker.py | 9 ++-------
vllm/model_executor/models/qwen3_vl.py | 17 +++--------------
5 files changed, 11 insertions(+), 42 deletions(-)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 2c2f45c2453ee..7a4fee76ae6b3 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -797,13 +797,8 @@ class Glm4vVisionTransformer(nn.Module):
# Use pre-computed cos_sin_cache from RotaryEmbedding
cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
- cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2)
- cos_w = cos[pos_ids[:, 1]]
- sin_h = sin[pos_ids[:, 0]]
- sin_w = sin[pos_ids[:, 1]]
-
- cos_combined = torch.cat([cos_h, cos_w], dim=-1)
- sin_combined = torch.cat([sin_h, sin_w], dim=-1)
+ cos_combined = cos[pos_ids].flatten(1)
+ sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined, pos_ids
def compute_attn_mask_seqlen(
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 2e4fd9645d88f..5b5d50ec8935a 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -738,13 +738,8 @@ class Qwen2_5_VisionTransformer(nn.Module):
# Use pre-computed cos_sin_cache from RotaryEmbedding
cos, sin = self.rotary_pos_emb.get_cos_sin(max_size)
- cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2)
- cos_w = cos[pos_ids[:, 1]]
- sin_h = sin[pos_ids[:, 0]]
- sin_w = sin[pos_ids[:, 1]]
-
- cos_combined = torch.cat([cos_h, cos_w], dim=-1)
- sin_combined = torch.cat([sin_h, sin_w], dim=-1)
+ cos_combined = cos[pos_ids].flatten(1)
+ sin_combined = sin[pos_ids].flatten(1)
cos_combined = cos_combined.reshape(
cos_combined.shape[0] // self.spatial_merge_unit,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 53df5972a8fe1..cda8eaf5377f1 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -724,13 +724,8 @@ class Qwen2VisionTransformer(nn.Module):
# Use pre-computed cos_sin_cache from RotaryEmbedding
cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
- cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2)
- cos_w = cos[pos_ids[:, 1]]
- sin_h = sin[pos_ids[:, 0]]
- sin_w = sin[pos_ids[:, 1]]
-
- cos_combined = torch.cat([cos_h, cos_w], dim=-1)
- sin_combined = torch.cat([sin_h, sin_w], dim=-1)
+ cos_combined = cos[pos_ids].flatten(1)
+ sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined
def compute_attn_mask_seqlen(
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 8274b92138f78..d2fd74a5e41ad 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -428,13 +428,8 @@ class Qwen3Omni_VisionTransformer(nn.Module):
# Use pre-computed cos_sin_cache from RotaryEmbedding
cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
- cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2)
- cos_w = cos[pos_ids[:, 1]]
- sin_h = sin[pos_ids[:, 0]]
- sin_w = sin[pos_ids[:, 1]]
-
- cos_combined = torch.cat([cos_h, cos_w], dim=-1)
- sin_combined = torch.cat([sin_h, sin_w], dim=-1)
+ cos_combined = cos[pos_ids].flatten(1)
+ sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 99a4007ef7f23..0c546309400b7 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -459,18 +459,13 @@ class Qwen3_VisionTransformer(nn.Module):
else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1)
for t, h, w in grid_thw
]
- pos_ids = torch.cat(pos_ids, dim=0)
+ pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True)
# Use pre-computed cos_sin_cache from RotaryEmbedding
cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
- cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2)
- cos_w = cos[pos_ids[:, 1]]
- sin_h = sin[pos_ids[:, 0]]
- sin_w = sin[pos_ids[:, 1]]
-
- cos_combined = torch.cat([cos_h, cos_w], dim=-1)
- sin_combined = torch.cat([sin_h, sin_w], dim=-1)
+ cos_combined = cos[pos_ids].flatten(1)
+ sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined
@@ -566,12 +561,6 @@ class Qwen3_VisionTransformer(nn.Module):
pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
hidden_states = hidden_states + pos_embeds
rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
- rotary_pos_emb_cos = rotary_pos_emb_cos.to(
- hidden_states.device, non_blocking=True
- )
- rotary_pos_emb_sin = rotary_pos_emb_sin.to(
- hidden_states.device, non_blocking=True
- )
cu_seqlens = torch.repeat_interleave(
grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
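The replacement relies on a simple indexing identity: gathering with the full `(num_tokens, 2)` `pos_ids` and flattening the two trailing dimensions yields the same tensor as gathering the height and width columns separately and concatenating them. A quick self-contained check, with arbitrary shapes:
```python
import torch

num_tokens, max_grid, half_dim = 6, 10, 4
cos = torch.randn(max_grid, half_dim)
pos_ids = torch.randint(0, max_grid, (num_tokens, 2))  # (h, w) index per token

old = torch.cat([cos[pos_ids[:, 0]], cos[pos_ids[:, 1]]], dim=-1)
new = cos[pos_ids].flatten(1)  # (num_tokens, 2, half_dim) -> (num_tokens, 2 * half_dim)

assert torch.equal(old, new)
```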
From 71d0ae1c54543689ea7541aa20b9522982b0815e Mon Sep 17 00:00:00 2001
From: Roman Solomatin
Date: Wed, 19 Nov 2025 09:28:40 +0300
Subject: [PATCH 006/249] [Misc] Update embedding/cross encoder tests to use
`mteb` v2 (#27329)
Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi
Signed-off-by: wang.yuqi
Co-authored-by: Cyrus Leung
Co-authored-by: Isotr0py
Co-authored-by: wang.yuqi
Co-authored-by: wang.yuqi
---
requirements/test.in | 2 +-
requirements/test.txt | 4 +-
.../language/pooling_mteb_test/mteb_utils.py | 181 +++++++++++-------
.../test_bge_reranker_v2_gemma.py | 31 ++-
.../pooling_mteb_test/test_mxbai_rerank.py | 5 +-
.../pooling_mteb_test/test_qwen3_reranker.py | 5 +-
6 files changed, 144 insertions(+), 84 deletions(-)
diff --git a/requirements/test.in b/requirements/test.in
index 30d97e9b9c7d0..05f6bcca5c2c4 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
-mteb[bm25s]>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.1
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
diff --git a/requirements/test.txt b/requirements/test.txt
index 3263b74c08797..bcd511660f85e 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -201,8 +201,6 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
-eval-type-backport==0.2.2
- # via mteb
evaluate==0.4.3
# via lm-eval
fastapi==0.116.1
@@ -490,7 +488,7 @@ msgpack==1.1.0
# via
# librosa
# ray
-mteb==1.38.11
+mteb==2.1.2
# via -r requirements/test.in
multidict==6.1.0
# via
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
index 0384ff82790f0..189cdbae99dcd 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
-from collections.abc import Sequence
import mteb
import numpy as np
import requests
import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
-MTEB_RERANK_LANGS = ["en"]
+MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3
+_empty_model_meta = ModelMeta(
+ loader=None,
+ name="vllm/model",
+ revision="1",
+ release_date=None,
+ languages=None,
+ framework=[],
+ similarity_fn_name=None,
+ n_parameters=None,
+ memory_usage_mb=None,
+ max_tokens=None,
+ embed_dim=None,
+ license=None,
+ open_weights=None,
+ public_training_code=None,
+ public_training_data=None,
+ use_instructions=None,
+ training_datasets=None,
+ modalities=["text"], # 'image' can be added to evaluate multimodal models
+)
+
+
+class VllmMtebEncoder(mteb.EncoderProtocol):
+ mteb_model_meta = _empty_model_meta
-class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
- super().__init__()
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
- sentences: Sequence[str],
+ inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
+ sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
embeds = embeds[np.argsort(r)]
return embeds
+ def similarity(
+ self,
+ embeddings1: np.ndarray,
+ embeddings2: np.ndarray,
+ ) -> np.ndarray:
+ # Cosine similarity
+ norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+ norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+ sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
+ return sim
+
+ def similarity_pairwise(
+ self,
+ embeddings1: Array,
+ embeddings2: Array,
+ ) -> Array:
+ # Cosine similarity
+ norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+ norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
+ sim = np.sum(embeddings1 * embeddings2, axis=1) / (
+ norm1.flatten() * norm2.flatten()
+ )
+ return sim
+
+
+class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
+ mteb_model_meta = _empty_model_meta
+
+ def __init__(self, vllm_model):
+ self.llm = vllm_model
+ self.rng = np.random.default_rng(seed=42)
+
def predict(
self,
- sentences: list[tuple[str, str, str | None]], # query, corpus, prompt
+ inputs1: DataLoader[mteb.types.BatchedInput],
+ inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
- r = self.rng.permutation(len(sentences))
- sentences = [sentences[i] for i in r]
-
- queries = [s[0] for s in sentences]
- corpus = [s[1] for s in sentences]
+ queries = [text for batch in inputs1 for text in batch["text"]]
+ corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
- scores = scores[np.argsort(r)]
return scores
-class OpenAIClientMtebEncoder(mteb.Encoder):
+class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
- super().__init__()
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
- def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
+ def encode(
+ self,
+ inputs: DataLoader[mteb.types.BatchedInput],
+ *args,
+ **kwargs,
+ ) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
+ sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
return embeds
-class ScoreClientMtebEncoder(mteb.Encoder):
+class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+ mteb_model_meta = _empty_model_meta
+
def __init__(self, model_name: str, url):
- super().__init__()
self.model_name = model_name
self.url = url
self.rng = np.random.default_rng(seed=42)
def predict(
self,
- sentences: list[tuple[str, str, str | None]], # query, corpus, prompt
+ inputs1: DataLoader[mteb.types.BatchedInput],
+ inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
- r = self.rng.permutation(len(sentences))
- sentences = [sentences[i] for i in r]
+ queries = [text for batch in inputs1 for text in batch["text"]]
+ full_corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = []
- for query, corpus, prompt in sentences:
+ for query, corpus in zip(queries, full_corpus):
outputs.append(self.get_score(query, corpus))
scores = np.array(outputs)
- scores = scores[np.argsort(r)]
return scores
def get_score(self, query, corpus):
@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"]
-def run_mteb_embed_task(encoder, tasks):
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
- evaluation = mteb.MTEB(tasks=tasks)
- results = evaluation.run(
+ results = mteb.evaluate(
encoder,
- verbosity=0,
- output_folder=None,
- encode_kwargs={
- "show_progress_bar": False,
- },
+ tasks,
+ cache=None,
+ show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
@@ -244,33 +301,39 @@ def mteb_test_embed_models(
assert st_main_score - vllm_main_score < atol
-def run_mteb_rerank(cross_encoder, tasks, languages):
- with tempfile.TemporaryDirectory() as results_folder:
+def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
+ with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s")
- tasks = mteb.get_tasks(tasks=tasks, languages=languages)
-
- subset = "default"
eval_splits = ["test"]
- evaluation = mteb.MTEB(tasks=tasks)
- evaluation.run(
- bm25s,
- verbosity=0,
- eval_splits=eval_splits,
- save_predictions=True,
- output_folder=f"{results_folder}/stage1",
- encode_kwargs={"show_progress_bar": False},
+ mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
+ tasks=tasks, languages=languages, eval_splits=eval_splits
)
- results = evaluation.run(
+ mteb.evaluate(
+ bm25s,
+ mteb_tasks,
+ prediction_folder=prediction_folder,
+ show_progress_bar=False,
+ # don't save results for test runs
+ cache=None,
+ overwrite_strategy="always",
+ )
+
+ second_stage_tasks = []
+ for task in mteb_tasks:
+ second_stage_tasks.append(
+ task.convert_to_reranking(
+ prediction_folder,
+ top_k=10,
+ )
+ )
+
+ results = mteb.evaluate(
cross_encoder,
- verbosity=0,
- eval_splits=eval_splits,
- top_k=10,
- save_predictions=True,
- output_folder=f"{results_folder}/stage2",
- previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
- encode_kwargs={"show_progress_bar": False},
+ second_stage_tasks,
+ show_progress_bar=False,
+ cache=None,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
- original_predict = hf_model.predict
-
- def _predict(
- sentences: list[tuple[str, str, str | None]], # query, corpus, prompt
- *args,
- **kwargs,
- ):
- # vllm and st both remove the prompt, fair comparison.
- prompts = [(s[0], s[1]) for s in sentences]
- return original_predict(prompts, *args, **kwargs, batch_size=8)
-
- hf_model.predict = _predict
- hf_model.original_predict = original_predict
-
if hf_model_callback is not None:
hf_model_callback(hf_model)
@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
- vllm_mteb_encoder=VllmMtebEncoder,
+ vllm_mteb_encoder=VllmMtebCrossEncoder,
atol=MTEB_RERANK_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 2927a37111364..6b2e469644926 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -2,13 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
+import mteb
import numpy as np
import pytest
import torch
+from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
- VllmMtebEncoder,
+ VllmMtebCrossEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
return torch.Tensor(scores)
-class GemmaMtebEncoder(VllmMtebEncoder):
+class GemmaMtebEncoder(VllmMtebCrossEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def predict(
self,
- sentences: list[tuple[str, str, str | None]], # query, corpus, prompt
+ inputs1: DataLoader[mteb.types.BatchedInput],
+ inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
- _sentences = []
- for query, corpus, prompt in sentences:
- query = self.query_template.format(query=query)
- corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
- _sentences.append((query, corpus, prompt))
-
- return super().predict(_sentences, *args, **kwargs)
+ queries = [
+ self.query_template.format(query=text)
+ for batch in inputs1
+ for text in batch["text"]
+ ]
+ corpus = [
+ self.document_template.format(doc=text, prompt=PROMPT)
+ for batch in inputs2
+ for text in batch["text"]
+ ]
+ outputs = self.llm.score(
+ queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
+ )
+ scores = np.array(outputs)
+ return scores
@pytest.mark.parametrize("model_info", RERANK_MODELS)
diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index fd04dc1990238..a6f2a89b268f1 100644
--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
return scores
scores = []
- for prompt in prompts:
- inputs = process_inputs([prompt])
+ for query, doc, *_ in prompts:
+ pairs = [(query, doc)]
+ inputs = process_inputs(pairs)
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 00e99f44cfdb1..9a1be6c0be1d6 100644
--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return scores
scores = []
- for prompt in prompts:
- inputs = process_inputs([prompt])
+ for query, doc, *_ in prompts:
+ pairs = [(query, doc)]
+ inputs = process_inputs(pairs)
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
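The `similarity` and `similarity_pairwise` hooks added to the encoder both compute cosine similarity; the pairwise variant is simply the diagonal of the full similarity matrix when the two inputs are aligned row by row. A small NumPy check of that relationship, independent of mteb:
```python
import numpy as np

rng = np.random.default_rng(0)
e1 = rng.normal(size=(5, 8))
e2 = rng.normal(size=(5, 8))

def cosine_matrix(a, b):
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

def cosine_pairwise(a, b):
    num = np.sum(a * b, axis=1)
    den = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    return num / den

assert np.allclose(np.diag(cosine_matrix(e1, e2)), cosine_pairwise(e1, e2))
```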
From a4511e38db375a85b4dd784c2c38528747288f46 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Wed, 19 Nov 2025 01:46:32 -0500
Subject: [PATCH 007/249] Speed up macOS smoke test (#28954)
Signed-off-by: Michael Goin
Signed-off-by: mgoin
---
.github/workflows/macos-smoke-test.yml | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index 42b05ecd5ac06..a183033c9adde 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -9,7 +9,7 @@ on:
jobs:
macos-m1-smoke-test:
runs-on: macos-latest
- timeout-minutes: 20
+ timeout-minutes: 30
steps:
- uses: actions/checkout@v4
@@ -37,15 +37,14 @@ jobs:
- name: Verify installation
run: |
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
- python -c "import torch; print(f'PyTorch: {torch.__version__}')"
- name: Smoke test vllm serve
- timeout-minutes: 10
run: |
# Start server in background
vllm serve Qwen/Qwen3-0.6B \
- --max-model-len=2048 \
+ --max-model-len=2K \
--load-format=dummy \
+ --hf-overrides '{"num_hidden_layers": 2}' \
--enforce-eager \
--port 8000 &
From 7ed27f3cb55e3f64614300ec7acde1b382a48541 Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Wed, 19 Nov 2025 07:52:30 +0100
Subject: [PATCH 008/249] [Doc]: fix typos in various files (#28945)
Signed-off-by: Didier Durand
---
docs/design/moe_kernel_features.md | 4 ++--
docs/design/plugin_system.md | 2 +-
docs/features/quantization/quark.md | 2 +-
examples/online_serving/prometheus_grafana/README.md | 2 +-
vllm/engine/arg_utils.py | 2 +-
vllm/envs.py | 2 +-
6 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 7663b82266f0b..36ae9506b65fb 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -4,7 +4,7 @@ The purpose of this document is to provide an overview of the various MoE kernel
## Fused MoE Modular All2All backends
-There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend.
+There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` subclasses provide an interface for each all2all backend.
The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support.
@@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
## Fused MoE Experts Kernels
-The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
+There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index dc2f7c4aed3c3..e8db8047ca4e6 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -49,7 +49,7 @@ Every plugin has three parts:
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
-- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre-/post-processing of the model prompt and model output for pooling models. The plugin function returns the fully qualified name of the IOProcessor class.
- **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase.
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index bd7bc186e13aa..c54d7d2251999 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -306,7 +306,7 @@ As examples, we provide some ready-to-use quantized mixed precision model to sho
### 2. inference the quantized mixed precision model in vLLM
-Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow:
+Models quantized with AMD Quark using mixed precision can natively be reloaded in vLLM, and e.g. evaluated using lm-evaluation-harness as follows:
```bash
lm_eval --model vllm \
diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
index 5cd4dab5a8fa7..9615210a2ad80 100644
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
@@ -46,7 +46,7 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the de
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
-On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`.
+On the Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates a DNS name for each container. You can just use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ab6e5e594c239..e2f7326448b3a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1500,7 +1500,7 @@ class EngineArgs:
# Local DP rank = 1, use pure-external LB.
if data_parallel_external_lb:
assert self.data_parallel_rank is not None, (
- "data_parallel_rank or node_rank must be spefified if "
+ "data_parallel_rank or node_rank must be specified if "
"data_parallel_external_lb is enable."
)
assert self.data_parallel_size_local in (1, None), (
diff --git a/vllm/envs.py b/vllm/envs.py
index 6d92d5afee501..e61fb114325c6 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1261,7 +1261,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# MoE routing strategy selector.
# See `RoutingSimulator.get_available_strategies()` # for available
# strategies.
- # Cutstom routing strategies can be registered by
+ # Custom routing strategies can be registered by
# RoutingSimulator.register_strategy()
# Note: custom strategies may not produce correct model outputs
"VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get(
From ae4821a1086325decbc801d3292dee42e42549bb Mon Sep 17 00:00:00 2001
From: Louie Tsai
Date: Tue, 18 Nov 2025 23:47:57 -0800
Subject: [PATCH 009/249] Add CPU support model (#28697)
Signed-off-by: Tsai, Louie
---
docs/models/hardware_supported_models/cpu.md | 26 ++++++++++++++++++++
1 file changed, 26 insertions(+)
create mode 100644 docs/models/hardware_supported_models/cpu.md
diff --git a/docs/models/hardware_supported_models/cpu.md b/docs/models/hardware_supported_models/cpu.md
new file mode 100644
index 0000000000000..0832755f8fbe2
--- /dev/null
+++ b/docs/models/hardware_supported_models/cpu.md
@@ -0,0 +1,26 @@
+# CPU - Intel® Xeon®
+
+## Supported Models
+
+### Text-only Language Models
+
+| Model | Architecture | Supported |
+|--------------------------------------|-------------------------------------------|-----------|
+| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ |
+| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ |
+| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ |
+| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ |
+| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ |
+| zai-org/GLM-4.5 | GLMForCausalLM | ✅ |
+| google/gemma | GemmaForCausalLM | ✅ |
+
+### Multimodal Language Models
+
+| Model | Architecture | Supported |
+|--------------------------------------|-------------------------------------------|-----------|
+| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ |
+| openai/whisper | WhisperForConditionalGeneration | ✅ |
+
+✅ Runs and is optimized.
+🟨 Runs correctly but is not yet fully optimized.
+❌ Does not pass accuracy tests or does not run.
From d69062c67af46a2e624be92162e9db585eef329b Mon Sep 17 00:00:00 2001
From: gnovack
Date: Wed, 19 Nov 2025 00:32:00 -0800
Subject: [PATCH 010/249] add support for --fully-sharded-loras in fused_moe
(#28761)
Signed-off-by: gnovack
Co-authored-by: Jee Jee Li
---
tests/lora/test_fused_moe_lora_kernel.py | 208 +++++++++++++++++-
tests/lora/test_olmoe_tp.py | 10 +-
vllm/lora/layers/fused_moe.py | 36 ++-
vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 24 +-
vllm/lora/punica_wrapper/punica_base.py | 2 +
vllm/lora/punica_wrapper/punica_gpu.py | 4 +
6 files changed, 274 insertions(+), 10 deletions(-)
diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py
index 91ab4a87c65f8..91c8b861c3c5c 100644
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -1,13 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
import random
import pytest
import torch
+from tests.utils import multi_gpu_test
from vllm import _custom_ops as ops
+from vllm.distributed import (
+ init_distributed_environment,
+ initialize_model_parallel,
+ tensor_model_parallel_all_gather,
+ tensor_model_parallel_all_reduce,
+)
+from vllm.distributed.parallel_state import (
+ get_tensor_model_parallel_world_size,
+)
from vllm.lora.ops.triton_ops import fused_moe_lora
from vllm.platforms import current_platform
+from vllm.utils.network_utils import get_open_port
@pytest.fixture(autouse=True)
@@ -122,6 +134,8 @@ def use_fused_moe_lora_kernel(
max_loras,
num_experts,
block_size,
+ fully_sharded=False,
+ offset=0,
):
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
@@ -195,10 +209,10 @@ def use_fused_moe_lora_kernel(
config["NUM_STAGES"],
config["SPLIT_K"],
mul_routed_weight,
+ fully_sharded=fully_sharded,
+ offset=offset,
)
- return output
-
def use_torch(
hidden_states,
@@ -317,3 +331,193 @@ def test_fused_moe_lora_kernel(
)
torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("num_tokens", [100])
+@pytest.mark.parametrize("top_k_num", [6])
+@pytest.mark.parametrize("num_experts", [64])
+@pytest.mark.parametrize("max_loras", [4])
+@pytest.mark.parametrize("N", [1408])
+@pytest.mark.parametrize("K", [2048])
+@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("column_parallel", [True, False])
+def test_fused_moe_lora_kernel_fully_sharded(
+ num_tokens,
+ top_k_num,
+ num_experts,
+ max_loras,
+ N,
+ K,
+ max_lora_rank,
+ block_size,
+ dtype,
+ seed,
+ column_parallel,
+):
+ current_platform.seed_everything(seed)
+ # the number of randomly generated sentences.
+ num_sequences = 10
+ # generate data
+ topk_ids, topk_weights, token_lora_mapping = sample_data(
+ num_tokens, num_sequences, max_loras, num_experts, top_k_num
+ )
+
+ def run_torch_spawn(fn, nprocs):
+ torch.multiprocessing.spawn(
+ fn,
+ args=(
+ nprocs,
+ f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}",
+ dtype,
+ seed,
+ N,
+ K,
+ num_tokens,
+ topk_ids,
+ topk_weights,
+ token_lora_mapping,
+ max_lora_rank,
+ top_k_num,
+ max_loras,
+ num_experts,
+ block_size,
+ column_parallel,
+ ),
+ nprocs=nprocs,
+ )
+
+ run_torch_spawn(use_fused_moe_lora_kernel_tensor_parallel, nprocs=2)
+
+
+def use_fused_moe_lora_kernel_tensor_parallel(
+ local_rank,
+ world_size,
+ init_method,
+ dtype,
+ seed,
+ N,
+ K,
+ num_tokens,
+ topk_ids,
+ topk_weights,
+ token_lora_mapping,
+ max_lora_rank,
+ top_k_num,
+ max_loras,
+ num_experts,
+ block_size,
+ column_parallel,
+):
+ def _get_shard_slice(shard_size):
+ return slice(local_rank * shard_size, (local_rank + 1) * shard_size)
+
+ current_platform.seed_everything(seed)
+
+ device = torch.device(f"cuda:{local_rank}")
+ torch.cuda.set_device(device)
+ torch.set_default_device(device)
+ torch.set_default_dtype(dtype)
+
+ init_distributed_environment(
+ world_size=world_size,
+ rank=local_rank,
+ local_rank=local_rank,
+ distributed_init_method=init_method,
+ )
+ initialize_model_parallel(world_size, 1)
+ tp_size = get_tensor_model_parallel_world_size()
+
+ input_dim = K if column_parallel else N
+ output_dim = N if column_parallel else K
+
+ # init lora weights
+ lora_a = torch.rand(
+ (
+ max_loras,
+ num_experts,
+ max_lora_rank,
+ input_dim,
+ ),
+ dtype=dtype,
+ )
+ lora_b = torch.rand(
+ (
+ max_loras,
+ num_experts,
+ output_dim,
+ max_lora_rank,
+ ),
+ dtype=dtype,
+ )
+
+ hidden_states = torch.rand(
+ (
+ num_tokens,
+ input_dim,
+ ),
+ dtype=dtype,
+ )
+
+ output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype)
+ topk_ids = topk_ids.to(device)
+ topk_weights = topk_weights.to(device)
+ token_lora_mapping = token_lora_mapping.to(device)
+
+ ref_output = use_torch(
+ hidden_states,
+ token_lora_mapping,
+ topk_ids,
+ [lora_a],
+ [lora_b],
+ top_k_num,
+ )
+
+ if column_parallel:
+ # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the rank dim,
+ # and Lora B is sliced along the output dim
+ lora_a_shard_size = max_lora_rank // tp_size
+ lora_a = lora_a[:, :, _get_shard_slice(lora_a_shard_size), :]
+ max_lora_rank = lora_a_shard_size
+ offset = 0
+
+ lora_b_shard_size = output_dim // tp_size
+ lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :]
+ output = output[:, :, _get_shard_slice(lora_b_shard_size)].contiguous()
+ else:
+ # Row parallel (e.g. down proj): LoRA A is sliced along the input dim,
+ # and LoRA B is sliced along the output dim
+ lora_a_shard_size = input_dim // tp_size
+ lora_a = lora_a[:, :, :, _get_shard_slice(lora_a_shard_size)]
+ hidden_states = hidden_states[:, _get_shard_slice(lora_a_shard_size)]
+
+ lora_b_shard_size = output_dim // tp_size
+ lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :]
+ offset = lora_b_shard_size * local_rank
+
+ use_fused_moe_lora_kernel(
+ topk_ids,
+ topk_weights,
+ token_lora_mapping,
+ max_lora_rank,
+ top_k_num,
+ [lora_a],
+ [lora_b],
+ hidden_states,
+ output,
+ max_loras,
+ num_experts,
+ block_size,
+ fully_sharded=True,
+ offset=offset,
+ )
+
+ if column_parallel:
+ output = tensor_model_parallel_all_gather(output)
+ else:
+ output = tensor_model_parallel_all_reduce(output)
+
+ torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1)
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
index e659c1e1a9a07..e3c9816625ba7 100644
--- a/tests/lora/test_olmoe_tp.py
+++ b/tests/lora/test_olmoe_tp.py
@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
import vllm
from vllm.lora.request import LoRARequest
@@ -111,8 +113,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files):
generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
+@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=2)
-def test_olmoe_lora_tp2(olmoe_lora_files):
+def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras):
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
@@ -122,14 +125,16 @@ def test_olmoe_lora_tp2(olmoe_lora_files):
trust_remote_code=True,
enable_chunked_prefill=True,
tensor_parallel_size=2,
+ fully_sharded_loras=fully_sharded_loras,
)
generate_and_test(llm, olmoe_lora_files, lora_id=1)
generate_and_test(llm, olmoe_lora_files, lora_id=2)
+@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=4)
-def test_olmoe_lora_tp4(olmoe_lora_files):
+def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras):
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
@@ -139,6 +144,7 @@ def test_olmoe_lora_tp4(olmoe_lora_files):
trust_remote_code=True,
enable_chunked_prefill=True,
tensor_parallel_size=4,
+ fully_sharded_loras=fully_sharded_loras,
)
generate_and_test(llm, olmoe_lora_files, lora_id=1)
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 8fb3efa220f6d..3291c41fcda1e 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -12,6 +12,7 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
+from vllm.distributed.utils import divide
from vllm.lora.layers.base import BaseLayerWithLoRA
from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -205,6 +206,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
shrink_config, ## pass the shrink config
expand_config, ## pass the expand config
self.adapter_enabled,
+ fully_sharded=self.fully_sharded,
)
result = func(*args, **kwargs)
@@ -250,7 +252,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)
intermediate_cache2 = moe_state_dict["intermediate_cache2"]
intermediate_cache3 = args[0]
- max_lora_rank = self.w1_lora_a_stacked.shape[-2]
+ max_lora_rank = self.w2_lora_a_stacked.shape[-2]
+
+ shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size)
+
self.punica_wrapper.add_lora_fused_moe(
intermediate_cache3,
intermediate_cache2,
@@ -266,6 +271,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
expand_config, ## pass the expand config
self.adapter_enabled,
True,
+ fully_sharded=self.fully_sharded,
+ offset=shard_size_w2 * self.tp_rank if self.fully_sharded else 0,
)
result = func(*args, **kwargs)
@@ -294,6 +301,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
model_config: PretrainedConfig | None = None,
) -> None:
"""Initializes lora matrices."""
+ self.fully_sharded = lora_config.fully_sharded_loras
self.adapter_enabled = torch.tensor(
[0] * (max_loras + 1), dtype=torch.int, device=self.device
@@ -303,7 +311,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
(
max_loras,
self.base_layer.local_num_experts,
- lora_config.max_lora_rank,
+ lora_config.max_lora_rank
+ if not self.fully_sharded
+ else divide(lora_config.max_lora_rank, self.tp_size),
self.base_layer.hidden_size,
),
dtype=lora_config.lora_dtype,
@@ -334,7 +344,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
(
max_loras,
self.base_layer.local_num_experts,
- self.base_layer.hidden_size,
+ self.base_layer.hidden_size
+ if not self.fully_sharded
+ else divide(self.base_layer.hidden_size, self.tp_size),
lora_config.max_lora_rank,
),
dtype=lora_config.lora_dtype,
@@ -345,7 +357,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
(
max_loras,
self.base_layer.local_num_experts,
- lora_config.max_lora_rank,
+ lora_config.max_lora_rank
+ if not self.fully_sharded
+ else divide(lora_config.max_lora_rank, self.tp_size),
self.base_layer.hidden_size,
),
dtype=lora_config.lora_dtype,
@@ -419,6 +433,20 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
w3_lora_b = w3_lora_b[start_idx:end_idx, :]
w2_lora_a = w2_lora_a[:, start_idx:end_idx]
+ if self.fully_sharded:
+ # Based on S-LoRA, we slice W1 and W3 A along the rank dim,
+ # and W2 B along the hidden_size dim.
+ w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0]
+ w13_start_idx = self.tp_rank * w13_shard_size
+ w13_end_idx = (self.tp_rank + 1) * w13_shard_size
+ w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :]
+ w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :]
+
+ w2_shard_size = self.w2_lora_b_stacked[index, eid].shape[0]
+ w2_start_idx = self.tp_rank * w2_shard_size
+ w2_end_idx = (self.tp_rank + 1) * w2_shard_size
+ w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :]
+
self.w1_lora_a_stacked[
index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1]
].copy_(w1_lora_a, non_blocking=True)
diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
index e2dd47dbb4e64..413ee8ecbbf96 100644
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -3,6 +3,10 @@
import torch
+from vllm.distributed import (
+ tensor_model_parallel_all_gather,
+ tensor_model_parallel_all_reduce,
+)
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import direct_register_custom_op
@@ -311,6 +315,7 @@ def _fused_moe_lora_expand(
num_stages: int,
split_k: int,
mul_routed_weight: bool = False,
+ offset: int = 0,
) -> None:
b_ptr = _get_ptr(lora_b_stacked, device)
K = max_lora_rank
@@ -380,7 +385,7 @@ def _fused_moe_lora_expand(
**expand_config,
)
for i in range(num_slices):
- output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i]
+ output[:, :, i * N + offset : (i + 1) * N + offset] += b_intermediate_cache1[i]
@torch.inference_mode()
@@ -416,6 +421,8 @@ def _fused_moe_lora(
expand_num_stages: int,
expand_split_k: int,
mul_routed_weight: bool = False,
+ fully_sharded: bool = False,
+ offset: int = 0,
) -> None:
assert len(lora_a_stacked) == len(lora_b_stacked) > 0
assert (
@@ -430,7 +437,6 @@ def _fused_moe_lora(
== expert_ids.shape[0]
== num_tokens_post_padded.shape[0]
)
- assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1]
assert output.shape[0] == topk_weights.shape[0]
assert top_k_num == topk_weights.shape[1]
device = qcurr_hidden_states.device
@@ -480,6 +486,19 @@ def _fused_moe_lora(
mul_routed_weight,
)
+ if fully_sharded:
+ if max_lora_rank == w1_lora_b_stacked.shape[-1]:
+ a_intermediate_cache1 = tensor_model_parallel_all_reduce(
+ a_intermediate_cache1
+ )
+ else:
+ a_intermediate_cache1 = tensor_model_parallel_all_gather(
+ a_intermediate_cache1
+ )
+
+ # reset max_lora_rank to the full rank after allgather
+ max_lora_rank = a_intermediate_cache1.shape[-1]
+
_fused_moe_lora_expand(
output,
a_intermediate_cache1,
@@ -510,6 +529,7 @@ def _fused_moe_lora(
expand_num_stages,
expand_split_k,
mul_routed_weight,
+ offset,
)
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index b6186e8561529..a6ffbb7b71ce4 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -483,6 +483,8 @@ class PunicaWrapperBase(PunicaWrapperABC):
expand_config,
adapter_enabled: torch.Tensor,
mul_routed_weight=False,
+ fully_sharded: bool = False,
+ offset: int = 0,
):
"""
Performs a fused forward computation for LoRA of
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index ede50a48af985..d863a5884d3c5 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -375,6 +375,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
expand_config,
adapter_enabled: torch.Tensor,
mul_routed_weight=False,
+ fully_sharded: bool = False,
+ offset: int = 0,
):
"""
Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer.
@@ -408,4 +410,6 @@ class PunicaWrapperGPU(PunicaWrapperBase):
expand_config.get("NUM_STAGES", 3),
expand_config.get("SPLIT_K", 1),
mul_routed_weight,
+ fully_sharded,
+ offset,
)
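The test and layer changes follow the same S-LoRA-style sharding convention: for the column-parallel (gate/up) projection, LoRA A is sliced along the rank dimension and LoRA B along the output dimension, so each tensor-parallel rank keeps a 1/tp_size slice and results are reassembled with an all-gather (or an all-reduce plus an output offset for the row-parallel down projection). A minimal single-process sketch of the slicing itself, with illustrative shapes and no distributed setup:
```python
import torch

tp_size, tp_rank = 2, 1                             # pretend we are TP rank 1 of 2
max_lora_rank, hidden_size, inter_size = 16, 32, 48

lora_a = torch.randn(max_lora_rank, hidden_size)    # (r, K)
lora_b = torch.randn(inter_size, max_lora_rank)     # (N, r)

def shard(x: torch.Tensor, dim: int) -> torch.Tensor:
    size = x.shape[dim] // tp_size
    return x.narrow(dim, tp_rank * size, size)

# Column-parallel projection: A sliced along the rank dim, B along the output dim.
lora_a_local = shard(lora_a, dim=0)
lora_b_local = shard(lora_b, dim=0)

assert lora_a_local.shape == (max_lora_rank // tp_size, hidden_size)
assert lora_b_local.shape == (inter_size // tp_size, max_lora_rank)
```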
From fdf93486d6c4f36be2f410a846bf68654041dc51 Mon Sep 17 00:00:00 2001
From: Michael Yao
Date: Wed, 19 Nov 2025 18:35:29 +0800
Subject: [PATCH 011/249] [Docs] Clean up moe_kernel_features.md (#28530)
Signed-off-by: windsonsea
---
docs/design/moe_kernel_features.md | 90 +++++++++++++++---------------
1 file changed, 44 insertions(+), 46 deletions(-)
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 36ae9506b65fb..f0d5a3e934f39 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -1,4 +1,4 @@
-# Fused MoE Kernel features
+# Fused MoE Kernel Features
The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels.
@@ -8,15 +8,15 @@ There are a number of all2all communication backends that are used to implement
The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support.
-The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.
+The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, and the finalize step requires the same format. All the backend `prepare` methods expect activations in the standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.
-The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16.
+The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format. Any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16.
Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step).
-Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag.
+Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag.
-unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP.
+Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
-| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Sub-class |
-|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
-| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
-| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
-| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
-| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
-| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
-| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
-| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
-| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |
+| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
+|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
+| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
+| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
+| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
+| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
+| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
+| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
+| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
+| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |
!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
2. A,T quantization occurs after dispatch.
3. All quantization happens after dispatch.
4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency")
- 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API.
+ 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs without dispatch or combine. These cannot be selected via environment variable. These are generally used for testing or adapting an expert subclass to the `fused_experts` API.
6. This depends on the experts implementation.
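For readers wiring this up by hand, the backend named in the first column is selected through the environment variable mentioned above; a minimal illustration (the value shown is one of the backends listed in the table, everything else is standard library code):

```python
import os

# Choose the all2all backend before vLLM is imported/initialized. Per the note
# above, every backend except flashinfer also requires expert parallelism
# (EP+DP or EP+TP) to be enabled.
os.environ.setdefault("VLLM_ALL2ALL_BACKEND", "deepep_low_latency")
```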
---
@@ -66,44 +65,43 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
-## Fused MoE Experts Kernels
+## Fused Experts Kernels
-There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
+There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
-Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
+Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`.
Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
-Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results.
+Each experts kernel supports one or more activation functions, e.g. silu or gelu, which are applied to the intermediate results.
As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts.
Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`.
-To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats.
+To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
-| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
-|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| triton | standard | all1 | G,A,T | silu, gelu,swigluoai,silu_no_mul,gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
-| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
-| deep gemm | standard,batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
-| cutlass_fp4 | standard,batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
-| cutlass_fp8 | standard,batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
-| flashinfer | standard | nvfp4,fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
-| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
-| deep gemm+triton2 | standard,batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
-| marlin | standard | 3 | 3 | silu,swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
-| marlin experts | standard,batched | N/A | N/A | silu,swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
-| trtllm | standard | mxfp4,nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
-| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
-| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
-| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
-| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
-| naive batched4 | batched | int8,fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
+| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
+|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
+| triton | standard | all1 | G,A,T | silu, gelu,swigluoai,silu_no_mul,gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
+| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
+| deep gemm | standard,batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
+| cutlass_fp4 | standard,batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
+| cutlass_fp8 | standard,batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
+| flashinfer | standard | nvfp4,fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
+| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
+| deep gemm+triton2 | standard,batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
+| marlin | standard,batched | 3 / N/A | 3 / N/A | silu,swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
+| trtllm | standard | mxfp4,nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
+| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
+| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
+| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
+| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
+| naive batched4 | batched | int8,fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
- 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params
+ 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params
3. uint4, uint8, fp8, fp4
4. This is a naive implementation of experts that supports batched format. Mainly used for testing.
5. The `activation` parameter is ignored and SwiGlu is used by default instead.
@@ -113,8 +111,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
-| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
-|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
-| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,`TritonExperts`,`TritonOrDeepGemmExperts`,`CutlassExpertsFp8`, `MarlinExperts` |
-| deepep_low_latency,pplx | `DeepEPLLPrepareAndFinalize`,`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,`BatchedTritonExperts`,`BatchedTritonOrDeepGemmExperts`,`CutlassBatchedExpertsFp8`,`BatchedMarlinExperts`|
-| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
+| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
+|---------|-----------------------------------------|----------------------------------------------|
+| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,`TritonExperts`,`TritonOrDeepGemmExperts`,`CutlassExpertsFp8`, `MarlinExperts` |
+| deepep_low_latency,pplx | `DeepEPLLPrepareAndFinalize`,`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,`BatchedTritonExperts`,`BatchedTritonOrDeepGemmExperts`,`CutlassBatchedExpertsFp8`,`BatchedMarlinExperts` |
+| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
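Since pairing a backend with an experts kernel ultimately comes down to matching the activation formats listed in the two tables above, the check can be illustrated with a few hand-copied entries (a standalone sketch, not a vLLM API; quantization type and format compatibility would be checked the same way):

```python
# Output activation format of each backend's prepare step (backend table) and
# the input formats accepted by a few experts kernels (experts table).
BACKEND_OUTPUT_FORMAT = {
    "deepep_high_throughput": "standard",
    "deepep_low_latency": "batched",
    "pplx": "batched",
}
EXPERTS_INPUT_FORMATS = {
    "DeepGemmExperts": {"standard"},
    "BatchedDeepGemmExperts": {"batched"},
    "TritonExperts": {"standard"},
    "BatchedTritonExperts": {"batched"},
}

def formats_match(backend: str, experts: str) -> bool:
    # A FusedMoEPrepareAndFinalize subclass and a FusedMoEPermuteExpertsUnpermute
    # subclass can form a modular kernel only if these formats line up.
    return BACKEND_OUTPUT_FORMAT[backend] in EXPERTS_INPUT_FORMATS[experts]

assert formats_match("deepep_low_latency", "BatchedDeepGemmExperts")
assert not formats_match("deepep_high_throughput", "BatchedTritonExperts")
```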
From 815160958327d601933139b9e76a01eb6d2bc5cf Mon Sep 17 00:00:00 2001
From: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Date: Wed, 19 Nov 2025 19:05:44 +0800
Subject: [PATCH 012/249] refactor(cpu_types_scalar.hpp): Unify scalar loop
implementations using unroll_loop (#28847)
Signed-off-by: ihb2032 <1355790728@qq.com>
Co-authored-by: lyd1992
---
csrc/cpu/cpu_types_scalar.hpp | 222 +++++++++++++---------------------
1 file changed, 87 insertions(+), 135 deletions(-)
diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp
index 1a9278bc662e5..f9da78283da5e 100644
--- a/csrc/cpu/cpu_types_scalar.hpp
+++ b/csrc/cpu/cpu_types_scalar.hpp
@@ -26,10 +26,6 @@ namespace vec_op {
#define FORCE_INLINE __attribute__((always_inline)) inline
-#define __max(a, b) ((a) > (b) ? (a) : (b))
-#define __min(a, b) ((a) < (b) ? (a) : (b))
-#define __abs(a) ((a) < (0) ? (0 - a) : (a))
-
typedef struct f16x8_t {
uint16_t val[8];
} f16x8_t;
@@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec {
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
- int num = __min(elem_num, VEC_ELEM_NUM);
+ int num = std::min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
@@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec {
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
- int num = __min(elem_num, VEC_ELEM_NUM);
+ int num = std::min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
@@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec {
explicit BF16Vec32(f16x32_t data) : reg(data) {};
explicit BF16Vec32(BF16Vec8& vec8_data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&vec8_data, this](int i) {
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
- }
+ });
}
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
@@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec {
f32x4_t reg;
explicit FP32Vec4(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec4() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec4(const float* ptr)
@@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec {
f32x8_t reg;
explicit FP32Vec8(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec8() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec8(const float* ptr)
@@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec {
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = fp16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
}
FP32Vec8(const BF16Vec8& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = bf16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
}
float reduce_sum() const {
float result = 0;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result += reg.val[i];
- }
+ unroll_loop(
+ [&result, this](int i) { result += reg.val[i]; });
return result;
}
FP32Vec8 exp() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = expf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 tanh() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = tanhf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 er() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = erf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] * b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator+(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] + b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator-(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] - b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator/(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] / b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
return FP32Vec8(ret);
}
@@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec {
f32x16_t reg;
explicit FP32Vec16(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec16() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec16(const float* ptr)
@@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(f32x16_t data) : reg(data) {};
FP32Vec16(const FP32Vec4& data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&data, this](int i) {
reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
- }
+ });
}
FP32Vec16(const FP32Vec8& data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&data, this](int i) {
reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
- }
+ });
}
FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = fp16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
}
explicit FP32Vec16(const BF16Vec16& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = bf16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
}
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
@@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec {
FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
FP32Vec16 operator*(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] * b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator+(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] + b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator-(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] - b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator/(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] / b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 max(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop([&ret, &b, this](int i) {
+ ret.val[i] = std::max(reg.val[i], b.reg.val[i]);
+ });
+ return FP32Vec16(ret);
}
FP32Vec16 min(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop([&ret, &b, this](int i) {
+ ret.val[i] = std::min(reg.val[i], b.reg.val[i]);
+ });
+ return FP32Vec16(ret);
}
FP32Vec16 abs() const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __abs(reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); });
+ return FP32Vec16(ret);
}
float reduce_sum() const {
float result = 0.0f;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result += reg.val[i];
- }
+ unroll_loop(
+ [&result, this](int i) { result += reg.val[i]; });
return result;
}
float reduce_max() const {
- float result = reg.val[0];
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result = __max(reg.val[i], result);
- }
+ float result = std::numeric_limits<float>::lowest();
+ unroll_loop(
+ [&result, this](int i) { result = std::max(reg.val[i], result); });
return result;
}
float reduce_min() const {
- float result = reg.val[0];
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result = __min(reg.val[i], result);
- }
+ float result = std::numeric_limits<float>::max();
+ unroll_loop(
+ [&result, this](int i) { result = std::min(reg.val[i], result); });
return result;
}
@@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec {
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
float sum = 0.0;
- int start = idx * group_size;
- int end = (idx + 1) * group_size;
-
- for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
- sum += reg.val[start];
- }
-
+ const int start = idx * group_size;
+ unroll_loop(
+ [&sum, &start, this](int i) { sum += reg.val[start + i]; });
return sum;
}
@@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) {
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
- int i = 0;
- for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_fp16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
}
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
- int i = 0;
- for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_fp16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
@@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
- int i = 0;
- for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_bf16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
- int i = 0;
- for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_bf16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
}
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
From bbc6c2f1e5bc856a9265dfa2b379ed1d242adc33 Mon Sep 17 00:00:00 2001
From: j20120307
Date: Wed, 19 Nov 2025 03:07:22 -0800
Subject: [PATCH 013/249] [CI/Build] Fix broken build on Apple M1 (#28999)
Signed-off-by: Kan Zhu
---
csrc/cpu/utils.hpp | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp
index d8399c56f6af8..d3def306b8069 100644
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -6,6 +6,10 @@
#include
#include
+#if defined(__APPLE__)
+ #include <sys/sysctl.h>
+#endif
+
#include "cpu_types.hpp"
namespace cpu_utils {
@@ -21,10 +25,12 @@ struct VecTypeTrait {
using vec_t = vec_op::FP32Vec16;
};
+#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
template <>
struct VecTypeTrait {
using vec_t = vec_op::BF16Vec16;
};
+#endif
template <>
struct VecTypeTrait {
@@ -44,9 +50,21 @@ struct Counter {
inline int64_t get_l2_size() {
static int64_t size = []() {
+#if defined(__APPLE__)
+ // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
+ int64_t l2_cache_size = 0;
+ size_t len = sizeof(l2_cache_size);
+ if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
+ l2_cache_size > 0) {
+ return l2_cache_size >> 1; // use 50% of L2 cache
+ }
+ // Fallback if sysctlbyname fails
+ return 128LL * 1024 >> 1; // use 50% of 128KB
+#else
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
assert(l2_cache_size != -1);
return l2_cache_size >> 1; // use 50% of L2 cache
+#endif
}();
return size;
}
From 97cfa99d59375de6d5e4c17dc6aea955ae75b493 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 19 Nov 2025 12:32:04 +0100
Subject: [PATCH 014/249] [Docs] Take env var definition out of folded
admonition (#29005)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
docs/configuration/env_vars.md | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md
index 2c0a898754fa0..f6d548a19d91f 100644
--- a/docs/configuration/env_vars.md
+++ b/docs/configuration/env_vars.md
@@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system:
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
-??? code
-
- ```python
- --8<-- "vllm/envs.py:env-vars-definition"
- ```
+```python
+--8<-- "vllm/envs.py:env-vars-definition"
+```
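To see why the Kubernetes warning on this page matters, a Service literally named `vllm` makes Kubernetes inject docker-link style variables such as `VLLM_PORT=tcp://10.0.0.1:8000` into pods, shadowing vLLM's own `VLLM_`-prefixed settings. A quick, illustrative way to spot such collisions from inside a pod:

```python
import os

# List every VLLM_* variable currently set; values shaped like "tcp://ip:port"
# were almost certainly injected by Kubernetes service discovery rather than by
# the operator, and renaming the Service is the usual fix.
for key, value in sorted(os.environ.items()):
    if key.startswith("VLLM_"):
        print(f"{key}={value}")
```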
From ba558c029ad65ab4f040c8320607ebd87612cf08 Mon Sep 17 00:00:00 2001
From: Tova Movshovitz
Date: Wed, 19 Nov 2025 13:37:11 +0200
Subject: [PATCH 015/249] [config] Expose `get_total_num_hidden_layers()` in
ModelConfig (#28961)
Signed-off-by: tovam
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung
---
vllm/config/model.py | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 3e8790a26e0e3..f61dbb6a695a2 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1369,11 +1369,7 @@ class ModelConfig:
# Coerce to 0 if explicitly set to None
return num_experts or 0
- def get_layers_start_end_indices(
- self, parallel_config: ParallelConfig
- ) -> tuple[int, int]:
- from vllm.distributed.utils import get_pp_indices
-
+ def get_total_num_hidden_layers(self) -> int:
if (
self.hf_text_config.model_type == "deepseek_mtp"
or self.hf_config.model_type == "mimo_mtp"
@@ -1393,6 +1389,15 @@ class ModelConfig:
total_num_hidden_layers = getattr(
self.hf_text_config, "num_hidden_layers", 0
)
+ return total_num_hidden_layers
+
+ def get_layers_start_end_indices(
+ self, parallel_config: ParallelConfig
+ ) -> tuple[int, int]:
+ from vllm.distributed.utils import get_pp_indices
+
+ total_num_hidden_layers = self.get_total_num_hidden_layers()
+
# the layout order is: DP x PP x TP
pp_rank = (
parallel_config.rank // parallel_config.tensor_parallel_size
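For intuition about what this refactor separates: the total layer count is independent of parallelism, while the start/end indices are a per-stage slice of that count. A standalone sketch of such a slicing policy (illustrative only; the real policy lives in `vllm.distributed.utils.get_pp_indices` and is not shown in this diff):

```python
def layers_start_end(total_num_hidden_layers: int, pp_rank: int, pp_size: int) -> tuple[int, int]:
    # Split layers as evenly as possible across pipeline stages; any remainder
    # is given to the later stages in this sketch.
    base, rem = divmod(total_num_hidden_layers, pp_size)
    sizes = [base + (1 if r >= pp_size - rem else 0) for r in range(pp_size)]
    start = sum(sizes[:pp_rank])
    return start, start + sizes[pp_rank]

print(layers_start_end(30, pp_rank=0, pp_size=4))  # (0, 7)
print(layers_start_end(30, pp_rank=3, pp_size=4))  # (22, 30)
```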
From da2f6800e0d6ac768c6f63b95f7c0755407f4263 Mon Sep 17 00:00:00 2001
From: Chen Bruce
Date: Wed, 19 Nov 2025 20:46:24 +0800
Subject: [PATCH 016/249] [Feat][Perf] Enable deepep-low-latency with
round-robin expert placement. (#28449)
Signed-off-by: bruceszchen
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
.../layers/fused_moe/all2all_utils.py | 11 ++
.../fused_moe/deepep_ll_prepare_finalize.py | 30 +++-
.../layers/fused_moe/fused_moe_method_base.py | 9 +-
vllm/model_executor/layers/fused_moe/layer.py | 157 +++++++++++++++---
.../fused_moe/unquantized_fused_moe_method.py | 7 +-
.../compressed_tensors_moe.py | 14 +-
.../model_executor/layers/quantization/fp8.py | 7 +-
.../layers/quantization/modelopt.py | 10 +-
8 files changed, 208 insertions(+), 37 deletions(-)
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
index 2dd625054339c..86c50f39f0076 100644
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -67,6 +67,7 @@ def maybe_roundup_layer_hidden_size(
def maybe_make_prepare_finalize(
moe: FusedMoEConfig,
quant_config: FusedMoEQuantConfig | None,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None:
if not moe.moe_parallel_config.use_all2all_kernels:
return None
@@ -134,6 +135,13 @@ def maybe_make_prepare_finalize(
elif moe.use_deepep_ll_kernels:
assert quant_config is not None
+ global_to_physical = physical_to_global = local_expert_global_ids = None
+ if routing_tables is not None:
+ (
+ global_to_physical,
+ physical_to_global,
+ local_expert_global_ids,
+ ) = routing_tables
all_to_all_args = dict(
max_num_tokens_per_dp_rank=moe.max_num_tokens,
token_hidden_size=moe.hidden_dim,
@@ -155,6 +163,9 @@ def maybe_make_prepare_finalize(
max_tokens_per_rank=moe.max_num_tokens,
num_dispatchers=all2all_manager.world_size,
use_fp8_dispatch=use_fp8_dispatch,
+ global_to_physical=global_to_physical,
+ physical_to_global=physical_to_global,
+ local_expert_global_ids=local_expert_global_ids,
)
return prepare_finalize
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index 06c9df317f7c7..e0db248958b47 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -85,6 +85,9 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
max_tokens_per_rank: int,
num_dispatchers: int,
use_fp8_dispatch: bool = False,
+ global_to_physical: torch.Tensor | None = None,
+ physical_to_global: torch.Tensor | None = None,
+ local_expert_global_ids: torch.Tensor | None = None,
):
super().__init__()
@@ -97,6 +100,17 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
self.handles: list[tuple | None] = [None, None]
self.num_dispatchers_ = num_dispatchers
+ topk_indices_dtype = self.topk_indices_dtype()
+
+ def _maybe_cast(tensor: torch.Tensor | None) -> torch.Tensor | None:
+ if tensor is None or topk_indices_dtype is None:
+ return tensor
+ return tensor.to(dtype=topk_indices_dtype)
+
+ self.global_to_physical = _maybe_cast(global_to_physical)
+ self.physical_to_global = _maybe_cast(physical_to_global)
+ self.local_expert_global_ids = _maybe_cast(local_expert_global_ids)
+
# We don't have enough information to determine if we should dispatch
# activation scales in a packed ue8m0 format during object construction
# time. This setting is handled by post_init_setup.
@@ -136,6 +150,16 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
def topk_indices_dtype(self) -> torch.dtype | None:
return torch.int64
+ def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor:
+ if self.global_to_physical is None:
+ return topk_ids
+ return self.global_to_physical[topk_ids]
+
+ def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor:
+ if self.local_expert_global_ids is None:
+ return expert_topk_ids
+ return self.local_expert_global_ids[expert_topk_ids]
+
def _do_quant(
self,
x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
@@ -226,9 +250,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
a1 = a1 * topk_weights.to(a1.dtype)
# Dispatch
+ dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids)
expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch(
a1,
- topk_ids,
+ dispatch_topk_ids,
self.max_tokens_per_rank,
num_experts,
use_fp8=self.use_fp8_dispatch,
@@ -313,11 +338,12 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
# weights have already been applied.
combine_topk_weights = torch.ones_like(topk_weights)
+ combine_topk_ids = self._map_global_to_physical_ids(topk_ids)
# TODO (varun) : Enable zero copy mode
dbo_maybe_run_recv_hook()
_, _, recv_hook = self.buffer.low_latency_combine(
fused_expert_output,
- topk_ids,
+ combine_topk_ids,
combine_topk_weights,
handle,
async_finish=False,
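The two mapping helpers added above are plain lookup-table gathers over the router output; a tiny standalone illustration (the table values are hypothetical, mirroring a round-robin layout of 8 experts on 4 ranks):

```python
import torch

# Physical slot of each global expert id under a hypothetical round-robin layout.
global_to_physical = torch.tensor([0, 2, 4, 6, 1, 3, 5, 7])

# topk_ids as produced by the router, in global expert numbering.
topk_ids = torch.tensor([[4, 1], [7, 0]])

# Same gather as _map_global_to_physical_ids: index the table with the ids.
dispatch_topk_ids = global_to_physical[topk_ids]
print(dispatch_topk_ids.tolist())  # [[1, 2], [7, 0]]
```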
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index 87f8c8d75a9b5..073e90a4e6808 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -50,10 +50,15 @@ class FusedMoEMethodBase(QuantizeMethodBase):
"""
return False
- def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> FusedMoEPrepareAndFinalize | None:
from .all2all_utils import maybe_make_prepare_finalize
- return maybe_make_prepare_finalize(self.moe, self.moe_quant_config)
+ return maybe_make_prepare_finalize(
+ self.moe, self.moe_quant_config, routing_tables
+ )
def select_gemm_impl(
self,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 023132acfed3f..c41995e4a9136 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
from contextlib import nullcontext
from enum import Enum
from functools import partial
-from typing import Literal, get_args, overload
+from typing import Literal, cast, get_args, overload
import torch
import torch.nn.functional as F
@@ -192,6 +192,42 @@ def determine_expert_map(
return (local_num_experts, expert_map, expert_mask)
+def determine_expert_placement_strategy(
+ expert_placement_strategy: ExpertPlacementStrategy,
+ moe_parallel_config: FusedMoEParallelConfig,
+ num_expert_group: int | None,
+ num_redundant_experts: int,
+ enable_eplb: bool,
+) -> ExpertPlacementStrategy:
+ if expert_placement_strategy == "round_robin":
+ round_robin_supported = (
+ (num_expert_group is not None and num_expert_group > 1)
+ and num_redundant_experts == 0
+ and not enable_eplb
+ )
+
+ if not round_robin_supported:
+ logger.warning(
+ "Round-robin expert placement is only supported for "
+ "models with multiple expert groups and no redundant "
+ "experts. Falling back to linear expert placement."
+ )
+ return "linear"
+ if (
+ moe_parallel_config.use_all2all_kernels
+ and not moe_parallel_config.use_deepep_ll_kernels
+ ):
+ logger.warning(
+ "Round-robin expert placement currently only supports "
+ "the DeepEP low-latency backend, but '%s' was configured. "
+ "Falling back to linear expert placement.",
+ moe_parallel_config.all2all_backend,
+ )
+ return "linear"
+
+ return expert_placement_strategy
+
+
def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
"""
Compresses the expert map by removing any -1 entries.
@@ -400,6 +436,9 @@ class FusedMoE(CustomOp):
self.expert_load_view: torch.Tensor | None = None
self.logical_to_physical_map: torch.Tensor | None = None
self.logical_replica_count: torch.Tensor | None = None
+ self.expert_placement_strategy: ExpertPlacementStrategy = (
+ vllm_config.parallel_config.expert_placement_strategy
+ )
# ROCm aiter shared experts fusion
self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
@@ -433,38 +472,27 @@ class FusedMoE(CustomOp):
"Redundant experts are only supported with EPLB."
)
- expert_placement_strategy = (
- vllm_config.parallel_config.expert_placement_strategy
+ self.expert_placement_strategy = determine_expert_placement_strategy(
+ expert_placement_strategy=self.expert_placement_strategy,
+ moe_parallel_config=self.moe_parallel_config,
+ num_expert_group=num_expert_group,
+ num_redundant_experts=num_redundant_experts,
+ enable_eplb=self.enable_eplb,
)
- if expert_placement_strategy == "round_robin":
- # TODO(Bruce): will support round robin expert placement with
- # EPLB enabled in the future.
- round_robin_supported = (
- (num_expert_group is not None and num_expert_group > 1)
- and num_redundant_experts == 0
- and not self.enable_eplb
- )
-
- if not round_robin_supported:
- logger.warning(
- "Round-robin expert placement is only supported for "
- "models with multiple expert groups and no redundant "
- "experts. Falling back to linear expert placement."
- )
- expert_placement_strategy = "linear"
self.expert_map: torch.Tensor | None
local_num_experts, expert_map, expert_mask = determine_expert_map(
ep_size=self.ep_size,
ep_rank=self.ep_rank,
global_num_experts=self.global_num_experts,
- expert_placement_strategy=expert_placement_strategy,
+ expert_placement_strategy=self.expert_placement_strategy,
num_fused_shared_experts=self.num_fused_shared_experts,
return_expert_mask=self.rocm_aiter_fmoe_enabled,
)
self.local_num_experts = local_num_experts
self.register_buffer("expert_map", expert_map)
self.register_buffer("expert_mask", expert_mask)
+ self._maybe_init_expert_routing_tables()
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Expert "
"placement strategy: %s. Local/global"
@@ -472,7 +500,7 @@ class FusedMoE(CustomOp):
" %s.",
self.ep_rank,
self.ep_size,
- expert_placement_strategy,
+ self.expert_placement_strategy,
self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map),
@@ -621,7 +649,12 @@ class FusedMoE(CustomOp):
# should be safe to swap out the quant_method.
def maybe_init_modular_kernel(self) -> None:
self.ensure_moe_quant_config_init()
- prepare_finalize = self.quant_method.maybe_make_prepare_finalize()
+ # routing_tables only needed for round-robin expert placement with
+ # DeepEP all2all backend.
+ routing_tables = self._maybe_init_expert_routing_tables()
+ prepare_finalize = self.quant_method.maybe_make_prepare_finalize(
+ routing_tables=routing_tables
+ )
if prepare_finalize is not None:
logger.debug(
"%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
@@ -703,6 +736,84 @@ class FusedMoE(CustomOp):
# By default, router/gate is called before FusedMoE forward pass
return False
+ def _maybe_init_expert_routing_tables(
+ self,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
+ # Currently routing_tables only needed for round-robin expert placement
+ # with DeepEP-ll all2all backend.
+ if (
+ self.expert_placement_strategy != "round_robin"
+ or not self.use_deepep_ll_kernels
+ ):
+ return None
+
+ if hasattr(self, "expert_global_to_physical"):
+ return cast(
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+ (
+ self.expert_global_to_physical,
+ self.expert_physical_to_global,
+ self.expert_local_to_global,
+ ),
+ )
+
+ if self.expert_map is None:
+ return None
+
+ routing_tables = self.ensure_round_robin_expert_routing_tables(
+ global_num_experts=self.global_num_experts,
+ ep_size=self.ep_size,
+ ep_rank=self.ep_rank,
+ local_num_experts=self.local_num_experts,
+ device=self.expert_map.device,
+ )
+
+ global_to_physical, physical_to_global, local_global = routing_tables
+ self.register_buffer("expert_global_to_physical", global_to_physical)
+ self.register_buffer("expert_physical_to_global", physical_to_global)
+ self.register_buffer("expert_local_to_global", local_global)
+
+ return routing_tables
+
+ @staticmethod
+ def ensure_round_robin_expert_routing_tables(
+ global_num_experts: int,
+ ep_size: int,
+ ep_rank: int,
+ local_num_experts: int,
+ device: torch.device | None = None,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ device_kwargs = {"device": device} if device is not None else {}
+ global_indices = torch.arange(
+ global_num_experts, dtype=torch.long, **device_kwargs
+ )
+ owner = torch.remainder(global_indices, ep_size)
+ local_index = torch.div(global_indices, ep_size, rounding_mode="floor")
+ base = global_num_experts // ep_size
+ remainder = global_num_experts % ep_size
+ physical_offset = owner * base
+ if remainder > 0:
+ remainder_tensor = torch.tensor(
+ remainder, dtype=torch.long, **device_kwargs
+ )
+ physical_offset = physical_offset + torch.minimum(owner, remainder_tensor)
+
+ global_to_physical = physical_offset + local_index
+ physical_to_global = torch.empty_like(global_to_physical)
+ physical_to_global[global_to_physical] = global_indices
+
+ local_global = torch.arange(
+ ep_rank,
+ global_num_experts,
+ ep_size,
+ dtype=torch.long,
+ **device_kwargs,
+ )
+ if local_global.numel() != local_num_experts:
+ local_global = local_global[:local_num_experts]
+
+ return (global_to_physical, physical_to_global, local_global)
+
def update_expert_map(self):
# ep_size and ep_rank should already be updated
assert self.expert_map is not None
@@ -711,12 +822,14 @@ class FusedMoE(CustomOp):
ep_size=self.ep_size,
ep_rank=self.ep_rank,
global_num_experts=self.global_num_experts,
+ expert_placement_strategy=self.expert_placement_strategy,
num_fused_shared_experts=self.num_fused_shared_experts,
return_expert_mask=self.rocm_aiter_fmoe_enabled,
)
self.local_num_experts = local_num_experts
self.register_buffer("expert_map", expert_map)
self.register_buffer("expert_mask", expert_mask)
+ self._maybe_init_expert_routing_tables()
if self.aiter_fmoe_shared_expert_enabled:
self._init_aiter_shared_experts_topK_buffer(
vllm_config=get_current_vllm_config(),
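To make the round-robin tables concrete, the following standalone sketch mirrors the arithmetic of `ensure_round_robin_expert_routing_tables` above (re-implemented for illustration, not imported from vLLM) and evaluates it for 8 global experts on 4 EP ranks:

```python
import torch

def round_robin_routing_tables(global_num_experts: int, ep_size: int, ep_rank: int):
    # Global expert g is owned by rank g % ep_size and stored at local slot
    # g // ep_size; physical slots are laid out contiguously per rank.
    g = torch.arange(global_num_experts, dtype=torch.long)
    owner = g % ep_size
    local_index = g // ep_size
    base, remainder = divmod(global_num_experts, ep_size)
    physical_offset = owner * base
    if remainder > 0:
        physical_offset = physical_offset + torch.minimum(owner, torch.tensor(remainder))
    global_to_physical = physical_offset + local_index
    physical_to_global = torch.empty_like(global_to_physical)
    physical_to_global[global_to_physical] = g
    local_to_global = torch.arange(ep_rank, global_num_experts, ep_size, dtype=torch.long)
    return global_to_physical, physical_to_global, local_to_global

g2p, p2g, local = round_robin_routing_tables(8, ep_size=4, ep_rank=1)
print(g2p.tolist())    # [0, 2, 4, 6, 1, 3, 5, 7]
print(p2g.tolist())    # [0, 4, 1, 5, 2, 6, 3, 7]
print(local.tolist())  # [1, 5]  -> EP rank 1 hosts global experts 1 and 5
```

Dispatch then only needs the `global_to_physical` gather shown in the DeepEP-LL changes above, while the local-to-global table lets each rank translate its local expert slots back to global ids.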
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 2e0376553b913..63b0e6f573d65 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -108,11 +108,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def allow_inplace(self) -> bool:
return True
- def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> FusedMoEPrepareAndFinalize | None:
if self.rocm_aiter_moe_enabled:
return None
else:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 06ee96d55419c..22b3c477f420f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -380,11 +380,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
(layer.w2_input_global_scale), requires_grad=False
)
- def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> mk.FusedMoEPrepareAndFinalize | None:
if self.use_marlin:
return None
elif not self.allow_flashinfer:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
logger.debug_once("%s", prepare_finalize.__class__.__name__)
@@ -890,11 +893,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
layer.w2_weight_scale
)
- def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> mk.FusedMoEPrepareAndFinalize | None:
if self.use_marlin or self.rocm_aiter_moe_enabled:
return None
else:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 0479bec338408..92fbdd7093483 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1018,7 +1018,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
del layer.w13_input_scale
del layer.w2_input_scale
- def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> mk.FusedMoEPrepareAndFinalize | None:
if (
self.rocm_aiter_moe_enabled
or self.use_marlin
@@ -1039,7 +1042,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
logger.debug_once("%s", prepare_finalize.__class__.__name__)
return prepare_finalize
else:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 476521813f464..38ab7cd4f115c 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -373,6 +373,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
def maybe_make_prepare_finalize(
self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
# TRT LLM not supported with all2all yet.
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
@@ -384,7 +385,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
logger.debug_once("%s", prepare_finalize.__class__.__name__)
return prepare_finalize
else:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,
@@ -1179,7 +1180,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
" for ModelOptNvFp4FusedMoE."
)
- def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+ def maybe_make_prepare_finalize(
+ self,
+ routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+ ) -> mk.FusedMoEPrepareAndFinalize | None:
if self.use_marlin or (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
@@ -1196,7 +1200,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
logger.debug_once("%s", prepare_finalize.__class__.__name__)
return prepare_finalize
else:
- return super().maybe_make_prepare_finalize()
+ return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,
From 09540cd918a5f7d776d7f7e0abec78fbc03938ad Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:56:21 +0100
Subject: [PATCH 017/249] [Doc]: fix typos in various files (#29010)
Signed-off-by: Didier Durand
---
docs/deployment/frameworks/skypilot.md | 2 +-
docs/design/prefix_caching.md | 2 +-
docs/features/nixl_connector_usage.md | 2 +-
docs/getting_started/quickstart.md | 2 +-
tests/v1/ec_connector/integration/README.md | 2 +-
vllm/multimodal/evs.py | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md
index f4a984a6433e2..e9b0d5f0671c3 100644
--- a/docs/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@@ -4,7 +4,7 @@
-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
+vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
## Prerequisites
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index bd4070f381d81..48536a877bd3f 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -1,6 +1,6 @@
# Automatic Prefix Caching
-Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang).
+Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc.) and most open source LLM inference frameworks (e.g., SGLang).
While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block:
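To make the hashing scheme just described concrete, here is a minimal sketch (not vLLM's actual implementation; the block size and hash function are placeholders) of chaining each block's hash through the prefix before it:

```python
# Illustrative sketch only: hash each fixed-size kv-cache block by the hash of
# the prefix before it plus the tokens inside it, so identical prefixes map to
# identical leading block hashes and their kv-cache blocks can be reused.
import hashlib

BLOCK_SIZE = 16  # hypothetical block size


def block_hashes(token_ids: list[int]) -> list[str]:
    hashes: list[str] = []
    prev_hash = ""
    full_len = len(token_ids) // BLOCK_SIZE * BLOCK_SIZE  # only full blocks are cached
    for start in range(0, full_len, BLOCK_SIZE):
        block = token_ids[start : start + BLOCK_SIZE]
        payload = f"{prev_hash}|{block}".encode()
        prev_hash = hashlib.sha256(payload).hexdigest()
        hashes.append(prev_hash)
    return hashes
```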
diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index 1ce038f4d6525..f0e25e31aa0b3 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -158,7 +158,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
## Experimental Feature
-### Heterogenuous KV Layout support
+### Heterogeneous KV Layout support
Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index cfc8b4d9838a7..9e86f785b10c7 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -286,7 +286,7 @@ If desired, you can also manually set the backend of your choice by configuring
- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
-For AMD ROCm, you can futher control the specific Attention implementation using the following variables:
+For AMD ROCm, you can further control the specific Attention implementation using the following variables:
- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
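As a hedged example (the model name below is an arbitrary placeholder), the Triton unified attention combination listed above can be applied from Python by setting the variables before vLLM is imported:

```python
# Pin the ROCm Triton unified attention path via the env vars documented above.
import os

os.environ["VLLM_ROCM_USE_AITER"] = "0"
os.environ["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "0"
os.environ["VLLM_ROCM_USE_AITER_MHA"] = "0"

from vllm import LLM  # noqa: E402

llm = LLM(model="facebook/opt-125m")  # any supported model
```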
diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md
index 30426e055ade8..2dbcb307fda32 100644
--- a/tests/v1/ec_connector/integration/README.md
+++ b/tests/v1/ec_connector/integration/README.md
@@ -113,7 +113,7 @@ Quick sanity check:
- Outputs differ between baseline and disagg
- Server startup fails
-- Encoder cache not found (should fallback to local execution)
+- Encoder cache not found (should fall back to local execution)
- Proxy routing errors
## Notes
diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py
index 4a288d2d238c2..8a36ea415da4d 100644
--- a/vllm/multimodal/evs.py
+++ b/vllm/multimodal/evs.py
@@ -185,7 +185,7 @@ def recompute_mrope_positions(
Args:
input_ids: (N,) All input tokens of the prompt (entire sequence).
- multimodal_positions: List of mrope positsions for each media.
+ multimodal_positions: List of mrope positions for each media.
mrope_positions: Existing mrope positions (4, N) for entire sequence.
num_computed_tokens: A number of computed tokens so far.
vision_start_token_id: Token indicating start of vision media.
From 4f5299f7174ffb10bdc640b47d3494083fc39c48 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 19 Nov 2025 14:50:30 +0100
Subject: [PATCH 018/249] Relax Transformers modeling backend MoE experts check
(#28952)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
docs/models/supported_models.md | 4 +++-
vllm/model_executor/models/transformers/moe.py | 9 ++++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index bd14bbb9ab662..80fe143269a76 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -79,7 +79,9 @@ To make your model compatible with the Transformers modeling backend, it needs:
1. Add `is_causal = False` to `MyAttention`.
- If your model is mixture-of-experts (MoE):
1. Your sparse MoE block must have an attribute called `experts`.
- 2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`.
+ 2. The class of `experts` (`MyExperts`) must either:
+ - Inherit from `nn.ModuleList` (naive).
+ - Or contain only 3D `nn.Parameter`s (packed).
3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`.
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`.
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 4973014c3d4ed..31db9d682bd40 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -256,7 +256,14 @@ class MoEMixin(MixtureOfExperts):
def _recursive_replace(module: nn.Module, prefix: str):
for child_name, child_module in module.named_children():
qual_name = maybe_prefix(prefix, child_name)
- if child_name == "experts" and isinstance(child_module, nn.ModuleList):
+ # Naive implementations will have experts as ModuleList
+ is_modulelist = isinstance(child_module, nn.ModuleList)
+ # Packed implementations will have experts as 3D tensors of shapes like:
+ # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size)
+ # down_proj = (num_experts, intermediate_size, hidden_size)
+ params = list(child_module.parameters())
+ is_3d = len(params) > 0 and all(p.ndim == 3 for p in params)
+ if child_name == "experts" and (is_modulelist or is_3d):
# Alias for readability
mlp = module
experts = child_module
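For context, a hedged sketch of the two expert layouts the relaxed check accepts; the class names here are illustrative, not real Transformers modules:

```python
# Naive layout: one nn.Module per expert, detected via isinstance(nn.ModuleList).
# Packed layout: every parameter is 3D, detected via all(p.ndim == 3).
import torch
from torch import nn


class NaiveExperts(nn.ModuleList):
    """Naive layout: a ModuleList holding one expert MLP per entry."""


class PackedExperts(nn.Module):
    """Packed layout: experts stored as stacked 3D parameters."""

    def __init__(self, num_experts: int, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_up_proj = nn.Parameter(
            torch.empty(num_experts, 2 * intermediate_size, hidden_size)
        )
        self.down_proj = nn.Parameter(
            torch.empty(num_experts, intermediate_size, hidden_size)
        )


packed = PackedExperts(num_experts=8, hidden_size=64, intermediate_size=128)
params = list(packed.parameters())
assert len(params) > 0 and all(p.ndim == 3 for p in params)  # passes the new check
```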
From 2c8b9182b5ced00d83bed15ef8bc0ac6e079b6ee Mon Sep 17 00:00:00 2001
From: Yanan Cao
Date: Wed, 19 Nov 2025 06:13:50 -0800
Subject: [PATCH 019/249] [CI] Reorganize compile tests so new tests are
automatically included in CI (#28625)
Signed-off-by: Yanan Cao
---
.buildkite/test-amd.yaml | 57 ++++++++---------
.buildkite/test-pipeline.yaml | 62 +++++++++----------
tests/compile/README.md | 5 ++
.../{piecewise => distributed}/__init__.py | 0
.../{ => distributed}/test_async_tp.py | 6 +-
.../test_fusion_all_reduce.py | 4 +-
.../{ => distributed}/test_fusions_e2e.py | 2 +-
.../test_sequence_parallelism.py | 4 +-
tests/compile/fullgraph/__init__.py | 0
.../{ => fullgraph}/test_basic_correctness.py | 2 +-
.../test_full_cudagraph.py | 0
.../{ => fullgraph}/test_full_graph.py | 2 +-
.../test_multimodal_compile.py | 0
.../test_multiple_graphs.py | 0
.../{piecewise => fullgraph}/test_simple.py | 0
.../test_toy_llama.py | 0
vllm/env_override.py | 2 +-
17 files changed, 74 insertions(+), 72 deletions(-)
create mode 100644 tests/compile/README.md
rename tests/compile/{piecewise => distributed}/__init__.py (100%)
rename tests/compile/{ => distributed}/test_async_tp.py (99%)
rename tests/compile/{ => distributed}/test_fusion_all_reduce.py (99%)
rename tests/compile/{ => distributed}/test_fusions_e2e.py (99%)
rename tests/compile/{ => distributed}/test_sequence_parallelism.py (99%)
create mode 100644 tests/compile/fullgraph/__init__.py
rename tests/compile/{ => fullgraph}/test_basic_correctness.py (99%)
rename tests/compile/{piecewise => fullgraph}/test_full_cudagraph.py (100%)
rename tests/compile/{ => fullgraph}/test_full_graph.py (99%)
rename tests/compile/{ => fullgraph}/test_multimodal_compile.py (100%)
rename tests/compile/{piecewise => fullgraph}/test_multiple_graphs.py (100%)
rename tests/compile/{piecewise => fullgraph}/test_simple.py (100%)
rename tests/compile/{piecewise => fullgraph}/test_toy_llama.py (100%)
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 2471b509a9fff..0049f35403409 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -187,7 +187,7 @@ steps:
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- - tests/compile/test_basic_correctness
+ - tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
@@ -215,7 +215,7 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -493,17 +493,12 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_pass_manager.py
- - pytest -v -s compile/test_fusion.py
- - pytest -v -s compile/test_fusion_attn.py
- - pytest -v -s compile/test_functionalization.py
- - pytest -v -s compile/test_silu_mul_quant_fusion.py
- # - pytest -v -s compile/test_sequence_parallelism.py
- # - pytest -v -s compile/test_async_tp.py
- - pytest -v -s compile/test_fusion_all_reduce.py
- - pytest -v -s compile/test_decorator.py
- - pytest -v -s compile/test_noop_elimination.py
- - pytest -v -s compile/test_aot_compile.py
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which contain heavier
+ # tests that are covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -515,9 +510,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_basic_correctness.py
- - pytest -v -s compile/test_multimodal_compile.py
- - pytest -v -s compile/piecewise/
+ # Run smoke tests under the fullgraph directory, except test_full_graph.py,
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -529,10 +526,10 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
- label: Cudagraph test
timeout_in_minutes: 20
@@ -1066,10 +1063,10 @@ steps:
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -1086,14 +1083,14 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - tests/compile/test_fusions_e2e.py
- - tests/compile/test_full_graph.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
@@ -1198,7 +1195,7 @@ steps:
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- - tests/compile/test_basic_correctness.py
+ - tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
@@ -1211,7 +1208,7 @@ steps:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1417,10 +1414,10 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/test_async_tp.py
- - pytest -v -s tests/compile/test_sequence_parallelism.py
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4ac76aba67b9c..e62cd60efaec0 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -167,7 +167,7 @@ steps:
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- - tests/compile/test_basic_correctness
+ - tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
@@ -197,7 +197,7 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -445,18 +445,12 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_graph_partition.py
- - pytest -v -s compile/test_config.py
- - pytest -v -s compile/test_pass_manager.py
- - pytest -v -s compile/test_fusion.py
- - pytest -v -s compile/test_fusion_attn.py
- - pytest -v -s compile/test_functionalization.py
- - pytest -v -s compile/test_silu_mul_quant_fusion.py
- - pytest -v -s compile/test_fusion_all_reduce.py
- - pytest -v -s compile/test_decorator.py
- - pytest -v -s compile/test_noop_elimination.py
- - pytest -v -s compile/test_aot_compile.py
- - pytest -v -s compile/test_qk_norm_rope_fusion.py
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which contain heavier
+ # tests that are covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -466,9 +460,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_basic_correctness.py
- - pytest -v -s compile/test_multimodal_compile.py
- - pytest -v -s compile/piecewise/
+ # Run smoke tests under the fullgraph directory, except test_full_graph.py,
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -479,10 +475,10 @@ steps:
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@@ -939,17 +935,22 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -966,12 +967,11 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - tests/compile/test_fusions_e2e.py
- - tests/compile/test_full_graph.py
+ - tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@@ -1069,7 +1069,7 @@ steps:
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- - tests/compile/test_basic_correctness.py
+ - tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
@@ -1084,7 +1084,7 @@ steps:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1264,10 +1264,10 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/test_async_tp.py
- - pytest -v -s tests/compile/test_sequence_parallelism.py
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+ - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
diff --git a/tests/compile/README.md b/tests/compile/README.md
new file mode 100644
index 0000000000000..300a956860005
--- /dev/null
+++ b/tests/compile/README.md
@@ -0,0 +1,5 @@
+# compile test folder structure
+
+- `compile/test_*.py` : unit tests that target particular code paths/features. Most future tests should be added here. New test files added here will be included in CI automatically.
+- `compile/fullgraph/` : full-model tests, including all tests previously in `compile/piecewise`. These tests do not target particular features. New test files added here will be included in CI automatically.
+- `compile/distributed/` : tests that require multiple GPUs. New test files added here will **NOT** be included in CI automatically, as these tests generally need to be configured manually to run on runners with a particular number/type of GPUs.
diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/distributed/__init__.py
similarity index 100%
rename from tests/compile/piecewise/__init__.py
rename to tests/compile/distributed/__init__.py
diff --git a/tests/compile/test_async_tp.py b/tests/compile/distributed/test_async_tp.py
similarity index 99%
rename from tests/compile/test_async_tp.py
rename to tests/compile/distributed/test_async_tp.py
index 71ee228781438..86d409f1eadb0 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/distributed/test_async_tp.py
@@ -27,13 +27,13 @@ from vllm.distributed.parallel_state import (
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
-from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import (
+from ...models.registry import HF_EXAMPLE_MODELS
+from ...utils import (
compare_two_settings,
create_new_process_for_each_test,
multi_gpu_test,
)
-from .backend import TestBackend
+from ..backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py
similarity index 99%
rename from tests/compile/test_fusion_all_reduce.py
rename to tests/compile/distributed/test_fusion_all_reduce.py
index 6d0a0ed7d89d2..d401d57032752 100644
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/distributed/test_fusion_all_reduce.py
@@ -33,8 +33,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
-from ..utils import has_module_attribute, multi_gpu_test
-from .backend import TestBackend
+from ...utils import has_module_attribute, multi_gpu_test
+from ..backend import TestBackend
class TestAllReduceRMSNormModel(torch.nn.Module):
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
similarity index 99%
rename from tests/compile/test_fusions_e2e.py
rename to tests/compile/distributed/test_fusions_e2e.py
index f22d60ef000b2..2e1b595a43895 100644
--- a/tests/compile/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -18,7 +18,7 @@ from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from vllm.utils.torch_utils import is_torch_equal_or_newer
-from ..utils import flat_product, multi_gpu_test
+from ...utils import flat_product, multi_gpu_test
is_blackwell = lambda: current_platform.is_device_capability(100)
"""Are we running on Blackwell, a lot of tests depend on it"""
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py
similarity index 99%
rename from tests/compile/test_sequence_parallelism.py
rename to tests/compile/distributed/test_sequence_parallelism.py
index 9cd7f64b04af5..30084dfd5a950 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -32,8 +32,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
-from ..utils import multi_gpu_test
-from .backend import TestBackend
+from ...utils import multi_gpu_test
+from ..backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
prompts = [
diff --git a/tests/compile/fullgraph/__init__.py b/tests/compile/fullgraph/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py
similarity index 99%
rename from tests/compile/test_basic_correctness.py
rename to tests/compile/fullgraph/test_basic_correctness.py
index 3f6898607f6b9..965938c4433dd 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -7,7 +7,7 @@ import pytest
from vllm.config import CompilationMode
from vllm.utils.torch_utils import cuda_device_count_stateless
-from ..utils import compare_all_settings
+from ...utils import compare_all_settings
@dataclasses.dataclass
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py
similarity index 100%
rename from tests/compile/piecewise/test_full_cudagraph.py
rename to tests/compile/fullgraph/test_full_cudagraph.py
diff --git a/tests/compile/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
similarity index 99%
rename from tests/compile/test_full_graph.py
rename to tests/compile/fullgraph/test_full_graph.py
index b4e5e56ac9fe6..2c11ecef7f029 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -15,7 +15,7 @@ from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassC
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
-from ..utils import create_new_process_for_each_test
+from ...utils import create_new_process_for_each_test
def models_list(*, all: bool = True, keywords: list[str] | None = None):
diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py
similarity index 100%
rename from tests/compile/test_multimodal_compile.py
rename to tests/compile/fullgraph/test_multimodal_compile.py
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/fullgraph/test_multiple_graphs.py
similarity index 100%
rename from tests/compile/piecewise/test_multiple_graphs.py
rename to tests/compile/fullgraph/test_multiple_graphs.py
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/fullgraph/test_simple.py
similarity index 100%
rename from tests/compile/piecewise/test_simple.py
rename to tests/compile/fullgraph/test_simple.py
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py
similarity index 100%
rename from tests/compile/piecewise/test_toy_llama.py
rename to tests/compile/fullgraph/test_toy_llama.py
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 14dae2850c354..9ae1af3af46cf 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -95,7 +95,7 @@ def memory_plan_reuse_patched(self):
# ===================================================
# This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to
# fix inductor partition + attention-nvfp4 quant fusion, tested in
-# `tests/compile/test_fusions_e2e.py::test_attn_quant`.
+# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`.
# For more context, see https://github.com/pytorch/pytorch/pull/165815.
From 1ffe934c8ae978e5ed82559a1eaeca05e37f9b35 Mon Sep 17 00:00:00 2001
From: vnadathur
Date: Wed, 19 Nov 2025 06:13:54 -0800
Subject: [PATCH 020/249] [torch.compile] caching of config fields should be
opt-out by default (#26468)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: vnadathur
Signed-off-by: WorldExplored
Signed-off-by: Srreyansh Sethi
Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com>
Co-authored-by: WorldExplored
Co-authored-by: Srreyansh Sethi <107075589+worldexplored@users.noreply.github.com>
Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com>
Co-authored-by: Luka Govedič
---
tests/config/test_config_utils.py | 166 +++++++++++++++++++++++++++++
vllm/compilation/backends.py | 105 +++++++++++++++----
vllm/compilation/pass_manager.py | 2 +-
vllm/config/cache.py | 31 ++++--
vllm/config/compilation.py | 40 +++----
vllm/config/model.py | 88 ++++++++--------
vllm/config/parallel.py | 49 ++++++---
vllm/config/utils.py | 119 ++++++++++++++++++++-
vllm/envs.py | 167 +++++++++++++++---------------
vllm/logging_utils/__init__.py | 2 +
vllm/logging_utils/lazy.py | 20 ++++
11 files changed, 599 insertions(+), 190 deletions(-)
create mode 100644 tests/config/test_config_utils.py
create mode 100644 vllm/logging_utils/lazy.py
diff --git a/tests/config/test_config_utils.py b/tests/config/test_config_utils.py
new file mode 100644
index 0000000000000..1277c7e64eb21
--- /dev/null
+++ b/tests/config/test_config_utils.py
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+
+import pytest
+
+from vllm.config.utils import get_hash_factors, hash_factors, normalize_value
+
+# Helpers
+
+
+def endswith_fqname(obj, suffix: str) -> bool:
+ # normalize_value(type) returns fully-qualified name
+ # Compare suffix to avoid brittle import paths.
+ out = normalize_value(obj)
+ return isinstance(out, str) and out.endswith(suffix)
+
+
+def expected_path(p_str: str = ".") -> str:
+ import pathlib
+
+ p = pathlib.Path(p_str)
+ return p.expanduser().resolve().as_posix()
+
+
+# Minimal dataclass to test get_hash_factors.
+# Avoid importing heavy vLLM configs.
+@dataclass
+class SimpleConfig:
+ a: object
+ b: object | None = None
+
+
+class DummyLogprobsMode(Enum):
+ RAW_LOGITS = "raw_logits"
+
+
+def test_hash_factors_deterministic():
+ """Test that hash_factors produces consistent SHA-256 hashes"""
+ factors = {"a": 1, "b": "test"}
+ hash1 = hash_factors(factors)
+ hash2 = hash_factors(factors)
+
+ assert hash1 == hash2
+ # Dict key insertion order should not affect the hash.
+ factors_reordered = {"b": "test", "a": 1}
+ assert hash_factors(factors_reordered) == hash1
+ assert len(hash1) == 64
+ assert all(c in "0123456789abcdef" for c in hash1)
+
+
+@pytest.mark.parametrize(
+ "inp, expected",
+ [
+ (None, None),
+ (True, True),
+ (1, 1),
+ (1.0, 1.0),
+ ("x", "x"),
+ (b"ab", "6162"),
+ (bytearray(b"ab"), "6162"),
+ ([1, 2], (1, 2)),
+ ({"b": 2, "a": 1}, (("a", 1), ("b", 2))),
+ ],
+)
+def test_normalize_value_matrix(inp, expected):
+ """Parametric input→expected normalization table."""
+ assert normalize_value(inp) == expected
+
+
+def test_normalize_value_enum():
+ # Enums normalize to (module.QualName, value).
+ # DummyLogprobsMode uses a string payload.
+ out = normalize_value(DummyLogprobsMode.RAW_LOGITS)
+ assert isinstance(out, tuple)
+ assert out[0].endswith("DummyLogprobsMode")
+ # Expect string payload 'raw_logits'.
+ assert out[1] == "raw_logits"
+
+
+def test_normalize_value_set_order_insensitive():
+ # Sets are unordered; normalize_value sorts elements for determinism.
+ assert normalize_value({3, 1, 2}) == normalize_value({1, 2, 3})
+
+
+def test_normalize_value_path_normalization():
+ from pathlib import Path # local import to avoid global dependency
+
+ # Paths expand/resolve to absolute strings.
+ # Stabilizes hashing across working dirs.
+ assert normalize_value(Path(".")) == expected_path(".")
+
+
+def test_normalize_value_uuid_and_to_json():
+ # Objects may normalize via uuid() or to_json_string().
+ class HasUUID:
+ def uuid(self):
+ return "test-uuid"
+
+ class ToJson:
+ def to_json_string(self):
+ return '{"x":1}'
+
+ assert normalize_value(HasUUID()) == "test-uuid"
+ assert normalize_value(ToJson()) == '{"x":1}'
+
+
+@pytest.mark.parametrize(
+ "bad",
+ [
+ (lambda x: x),
+ (type("CallableInstance", (), {"__call__": lambda self: 0}))(),
+ (lambda: (lambda: 0))(), # nested function instance
+ ],
+)
+def test_error_cases(bad):
+ """Inputs expected to raise TypeError."""
+ # Reject functions/lambdas/callable instances
+ # to avoid under-hashing.
+ with pytest.raises(TypeError):
+ normalize_value(bad)
+
+
+def test_enum_vs_int_disambiguation():
+ # int stays primitive
+ nf_int = normalize_value(1)
+ assert nf_int == 1
+
+ # enum becomes ("module.QualName", value)
+ nf_enum = normalize_value(DummyLogprobsMode.RAW_LOGITS)
+ assert isinstance(nf_enum, tuple) and len(nf_enum) == 2
+ enum_type, enum_val = nf_enum
+ assert enum_type.endswith(".DummyLogprobsMode")
+ assert enum_val == "raw_logits"
+
+ # Build factor dicts from configs with int vs enum
+ f_int = get_hash_factors(SimpleConfig(1), set())
+ f_enum = get_hash_factors(SimpleConfig(DummyLogprobsMode.RAW_LOGITS), set())
+ # The int case remains a primitive value
+ assert f_int["a"] == 1
+ # The enum case becomes a tagged tuple ("module.QualName", "raw_logits")
+ assert isinstance(f_enum["a"], tuple) and f_enum["a"][1] == "raw_logits"
+ # Factor dicts must differ so we don't collide primitives with Enums.
+ assert f_int != f_enum
+ # Hash digests must differ correspondingly
+ assert hash_factors(f_int) != hash_factors(f_enum)
+
+ # Hash functions produce stable hex strings
+ h_int = hash_factors(f_int)
+ h_enum = hash_factors(f_enum)
+ assert isinstance(h_int, str) and len(h_int) == 64
+ assert isinstance(h_enum, str) and len(h_enum) == 64
+
+
+def test_classes_are_types():
+ """Types normalize to FQNs; include real vLLM types."""
+ # Only classes allowed; functions/lambdas are rejected.
+ # Canonical form is the fully-qualified name.
+ assert isinstance(normalize_value(str), str)
+
+ class LocalDummy:
+ pass
+
+ assert endswith_fqname(LocalDummy, ".LocalDummy")
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 60ef6eef21663..1e66f21ff6388 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -4,12 +4,14 @@
import ast
import dataclasses
import hashlib
+import json
import operator
import os
import pprint
import time
from collections.abc import Callable, Sequence
from contextlib import contextmanager
+from functools import partial
from typing import Any
import torch
@@ -23,7 +25,9 @@ from vllm.compilation.partition_rules import (
should_split,
)
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.config.utils import hash_factors
from vllm.logger import init_logger
+from vllm.logging_utils import lazy
from vllm.platforms import current_platform
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -580,35 +584,47 @@ class VllmBackend:
def __call__(
self, graph: fx.GraphModule, example_inputs
) -> VllmSerializableFunction:
- from .caching import _compute_code_hash, compilation_config_hash_factors
-
vllm_config = self.vllm_config
+ # Minimal hashing here with existing utilities, reused below.
+
+ env_factors = envs.compile_factors()
+ env_hash = hash_factors(env_factors)
+ # Compute config/compiler/code hashes once and reuse
+ config_hash = vllm_config.compute_hash()
+ compiler_hash = self.compiler_manager.compute_hash(vllm_config)
+ forward_code_files = list(sorted(self.compilation_config.traced_files))
+
+ logger.debug(
+ "Traced files (to be considered for compilation cache):\n%s",
+ lazy(lambda: "\n".join(forward_code_files)),
+ )
+ hash_content = []
+ for filepath in forward_code_files:
+ hash_content.append(filepath)
+ if filepath == "":
+ # This means the function was dynamically generated, with
+ # e.g. exec(). We can't actually check these.
+ continue
+ try:
+ with open(filepath) as f:
+ hash_content.append(f.read())
+ except Exception:
+ logger.warning("Failed to read file %s", filepath)
+ continue
+ code_hash = hashlib.sha256("\n".join(hash_content).encode()).hexdigest()
+ # Clear after consumption
+ self.compilation_config.traced_files.clear()
if not self.compilation_config.cache_dir:
# no provided cache dir, generate one based on the known factors
# that affects the compilation. if none of the factors change,
# the cache dir will be the same so that we can reuse the compiled
# graph.
-
- factors = compilation_config_hash_factors(vllm_config)
- # 2. factors come from the code files that are traced by Dynamo (
- # it mainly summarizes how the model is used in forward pass)
- code_hash = _compute_code_hash(self.compilation_config.traced_files)
- self.compilation_config.traced_files.clear()
- factors.append(code_hash)
-
- # 3. compiler hash
- compiler_hash = self.compiler_manager.compute_hash(vllm_config)
- factors.append(compiler_hash)
-
- # combine all factors to generate the cache dir
- hash_key = hashlib.md5(
- str(factors).encode(), usedforsecurity=False
- ).hexdigest()[:10]
-
+ factors = [env_hash, config_hash, code_hash, compiler_hash]
+ # Use SHA-256 for cache key hashing to be consistent across
+ # compute_hash functions. Truncate for a short cache dir name.
+ hash_key = hashlib.sha256(str(factors).encode()).hexdigest()[:10]
cache_dir = os.path.join(
- envs.VLLM_CACHE_ROOT,
- "torch_compile_cache",
- hash_key,
+ envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key
)
self.compilation_config.cache_dir = cache_dir
@@ -621,6 +637,7 @@ class VllmBackend:
os.makedirs(local_cache_dir, exist_ok=True)
self.compilation_config.local_cache_dir = local_cache_dir
+ # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(
self.compilation_config.inductor_compile_config
)
@@ -638,6 +655,50 @@ class VllmBackend:
local_cache_dir, disable_cache, self.prefix
)
+ # Log the cache key factors computed above for diagnostics.
+
+ logger.debug(
+ "torch.compile cache factors: env=%s cfg=%s comp=%s code=%s dir=%s",
+ env_hash,
+ config_hash,
+ compiler_hash,
+ code_hash,
+ local_cache_dir,
+ )
+
+ # Persist and log only hash-relevant factors together.
+ try:
+ logger.debug(
+ "Compile env factors (raw):\n%s\nVllm config hash: %s",
+ lazy(partial(pprint.pformat, env_factors, width=120)),
+ config_hash,
+ )
+ meta_path = os.path.join(local_cache_dir, "cache_key_factors.json")
+ if not os.path.exists(meta_path):
+ with open(meta_path, "w") as f:
+ json.dump(
+ {
+ "env": env_factors, # raw factors used for env_hash
+ "config_hash": config_hash,
+ "code_hash": code_hash,
+ "compiler_hash": compiler_hash,
+ },
+ f,
+ indent=2,
+ sort_keys=True,
+ )
+ except Exception:
+ # Best-effort only; metadata write failures are non-fatal.
+ logger.warning(
+ (
+ "Could not write compile cache metadata at %s; continuing without "
+ "metadata. Compiled cache remains valid; diagnostics may be "
+ "limited."
+ ),
+ local_cache_dir,
+ exc_info=True,
+ )
+
# when dynamo calls the backend, it means the bytecode
# transform and analysis are done
compilation_counter.num_graphs_seen += 1
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 0e8bb2fc97351..fe2547d7fecaf 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -127,7 +127,7 @@ class PostGradPassManager(CustomGraphPass):
affects compilation caching. Its uuid depends on the UUIDs of all
dependent passes and the pass config. See InductorPass for more info.
"""
- state = {"pass_config": self.pass_config.uuid(), "passes": []}
+ state = {"pass_config": self.pass_config.compute_hash(), "passes": []}
for pass_ in self.passes:
state["passes"].append(pass_.uuid())
state["passes"].append(self.fix_functionalization.uuid())
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 864cf1be81b20..2652c7c06ad0f 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import hashlib
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
@@ -160,13 +159,29 @@ class CacheConfig:
excluding anything before input ids/embeddings and after
the final hidden states.
"""
- factors: list[Any] = []
- factors.append(self.cache_dtype)
- factors.append(self.mamba_cache_dtype)
- factors.append(self.mamba_ssm_cache_dtype)
- # `cpu_offload_gb` does not use `torch.compile` yet.
- hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
- return hash_str
+ ignored_factors = {
+ # Runtime/derived knobs that don't affect compiled graph shape
+ "gpu_memory_utilization",
+ "swap_space",
+ "is_attention_free",
+ "num_gpu_blocks_override",
+ "enable_prefix_caching",
+ "prefix_caching_hash_algo",
+ # `cpu_offload_gb` does not use `torch.compile` yet.
+ "cpu_offload_gb",
+ "cpu_kvcache_space_bytes",
+ "mamba_page_size_padded",
+ # Post-init/derived counters
+ "num_gpu_blocks",
+ "num_cpu_blocks",
+ # WIP feature toggle not impacting compiled graph shape
+ "kv_sharing_fast_prefill",
+ }
+
+ from vllm.config.utils import get_hash_factors, hash_factors
+
+ factors = get_hash_factors(self, ignored_factors)
+ return hash_factors(factors)
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 088d0b1af757a..ca01cb3fb55d5 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
-import hashlib
from collections import Counter
from collections.abc import Callable
from dataclasses import asdict, field
@@ -160,7 +159,7 @@ class PassConfig:
current_platform.get_device_capability().to_int(), {}
)
- def uuid(self):
+ def compute_hash(self) -> str:
"""
Produces a hash unique to the pass configuration.
Any new fields that affect compilation should be added to the hash.
@@ -506,28 +505,33 @@ class CompilationConfig:
def compute_hash(self) -> str:
"""
- WARNING: Whenever a new field is added to this config,
- ensure that it is included in the factors list if
- it affects the computation graph.
-
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
- factors: list[Any] = []
- factors.append(self.mode)
- factors.append(self.backend)
- factors.append(self.custom_ops)
- factors.append(self.splitting_ops)
- factors.append(self.use_inductor)
- factors.append(self.use_inductor_graph_partition)
- factors.append(self.inductor_compile_config)
- factors.append(self.inductor_passes)
- factors.append(self.pass_config.uuid())
- factors.append(self.compile_cache_save_format)
- return hashlib.sha256(str(factors).encode()).hexdigest()
+ # Opt-out: default-include declared fields; keep a tiny exclude set;
+ # normalize types; keep SHA-256. For nested opaque configs, include a
+ # stable identifier (e.g., pass_config.compute_hash()) instead of object id.
+
+ ignored_factors = {
+ # Paths/dirs and runtime/metrics that don’t affect compiled graph
+ "debug_dump_path",
+ "cache_dir",
+ "local_cache_dir",
+ "bs_to_padded_graph_size",
+ "traced_files",
+ "compilation_time",
+ "static_forward_context",
+ "pass_config", # handled separately below
+ }
+
+ from vllm.config.utils import get_hash_factors, hash_factors
+
+ factors = get_hash_factors(self, ignored_factors)
+ factors["pass_config"] = self.pass_config.compute_hash()
+ return hash_factors(factors)
def __repr__(self) -> str:
exclude = {
diff --git a/vllm/config/model.py b/vllm/config/model.py
index f61dbb6a695a2..b563a40eb8fc9 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import hashlib
-import json
import warnings
from collections.abc import Callable
from dataclasses import InitVar, field
@@ -18,7 +16,7 @@ import vllm.envs as envs
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.scheduler import RunnerType
-from vllm.config.utils import assert_hashable, config, getattr_iter
+from vllm.config.utils import config, getattr_iter
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
@@ -324,50 +322,50 @@ class ModelConfig:
excluding anything before input ids/embeddings and after
the final hidden states.
"""
- factors: list[Any] = []
- factors.append(self.model)
- factors.append(self.dtype)
- factors.append(self.quantization)
- factors.append(self.revision)
- factors.append(self.code_revision)
- factors.append(self.max_model_len)
- factors.append(self.max_logprobs)
- factors.append(self.disable_sliding_window)
- factors.append(self.trust_remote_code)
- factors.append(self.generation_config)
- factors.append(self.model_impl)
- factors.append(self.override_generation_config)
- factors.append(self.video_pruning_rate)
- factors.append(self.enable_prompt_embeds)
+ ignored_factors = {
+ "runner",
+ "convert",
+ "task",
+ "tokenizer",
+ "tokenizer_mode",
+ "seed",
+ "hf_config_path",
+ "allowed_local_media_path",
+ "allowed_media_domains",
+ "tokenizer_revision",
+ "spec_target_max_model_len",
+ "enforce_eager",
+ "logprobs_mode",
+ "disable_cascade_attn",
+ "skip_tokenizer_init",
+ "enable_prompt_embeds",
+ "served_model_name",
+ "config_format",
+ "hf_token",
+ "hf_overrides",
+ "logits_processor_pattern",
+ "enable_sleep_mode",
+ "override_attention_dtype",
+ "logits_processors",
+ "io_processor_plugin",
+ "pooler_config",
+ "override_pooler_config",
+ "multimodal_config",
+ "limit_mm_per_prompt",
+ "media_io_kwargs",
+ "mm_processor_kwargs",
+ "mm_processor_cache_gb",
+ "mm_processor_cache_type",
+ "mm_shm_cache_max_object_size_mb",
+ "mm_encoder_tp_mode",
+ "interleave_mm_strings",
+ "skip_mm_profiling",
+ }
- # hf_config can control how the model looks!
- try:
- hf_config_json = self.hf_config.to_json_string(use_diff=False)
- except TypeError:
- from transformers import PretrainedConfig
+ from vllm.config.utils import get_hash_factors, hash_factors
- from vllm.utils.jsontree import json_map_leaves
-
- # Handle nested HF configs with unserializable values gracefully
- hf_config_json = (
- json.dumps(
- json_map_leaves(
- lambda v: v.to_dict()
- if isinstance(v, PretrainedConfig)
- else str(v),
- self.hf_config.to_dict(),
- ),
- indent=2,
- sort_keys=True,
- )
- + "\n"
- )
-
- factors.append(hf_config_json)
-
- str_factors = str(factors)
- assert_hashable(str_factors)
- return hashlib.sha256(str(factors).encode()).hexdigest()
+ factors = get_hash_factors(self, ignored_factors)
+ return hash_factors(factors)
def _update_nested(
self,
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 9a6326d62e82e..0f107a7a3ef83 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import hashlib
import os
from typing import TYPE_CHECKING, Any, Literal
@@ -448,19 +447,41 @@ class ParallelConfig:
This hash is also used for DP worker configuration validation
to prevent hangs from mismatched collective communication patterns.
"""
- factors: list[Any] = []
- factors.append(self.pipeline_parallel_size)
- factors.append(self.tensor_parallel_size)
- factors.append(self.enable_expert_parallel)
- factors.append(self.data_parallel_size)
- factors.append(self.all2all_backend)
- factors.append(self.enable_eplb)
- if self.enable_eplb:
- factors.append(self.eplb_config.log_balancedness)
- factors.append(self.eplb_config.window_size)
- factors.append(self.eplb_config.step_interval)
- factors.append(self.eplb_config.num_redundant_experts)
- return hashlib.sha256(str(factors).encode()).hexdigest()
+ ignored_factors = {
+ # Derived/runtime topology, networking, or launch details
+ "data_parallel_rank",
+ "data_parallel_rank_local",
+ "data_parallel_backend",
+ "data_parallel_external_lb",
+ "data_parallel_hybrid_lb",
+ "data_parallel_master_ip",
+ "data_parallel_master_port",
+ "_data_parallel_master_port_list",
+ "data_parallel_rpc_port",
+ "rank",
+ "master_addr",
+ "master_port",
+ "node_rank",
+ "nnodes",
+ "max_parallel_loading_workers",
+ "disable_custom_all_reduce",
+ "ray_workers_use_nsight",
+ "ray_runtime_env",
+ "placement_group",
+ "distributed_executor_backend",
+ "worker_cls",
+ "sd_worker_cls",
+ "worker_extension_cls",
+ "_api_process_count",
+ "_api_process_rank",
+ }
+
+ from vllm.config.utils import get_hash_factors, hash_factors
+
+ factors = get_hash_factors(self, ignored_factors)
+ # Explicitly include backend affecting env factor as before
+ factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND)
+ return hash_factors(factors)
def __post_init__(self) -> None:
# Set all2all_backend from env var if not specified, with deprecation warning
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 7e0878d96bbd6..02f2b75f608f1 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -3,14 +3,19 @@
"""Utility functions for vLLM config dataclasses."""
import ast
+import enum
+import hashlib
import inspect
+import json
+import pathlib
import textwrap
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping, Sequence, Set
from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
import regex as re
+import torch
from pydantic.fields import FieldInfo
from typing_extensions import runtime_checkable
@@ -176,3 +181,115 @@ def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
)
processed_overrides[field_name] = value
return replace(config, **processed_overrides)
+
+
+def normalize_value(x):
+ """Return a stable, JSON-serializable canonical form for hashing.
+ Order: primitives, special types (Enum, callable, torch.dtype, Path), then
+ generic containers (Mapping/Set/Sequence) with recursion.
+ """
+ # Fast path
+ if x is None or isinstance(x, (bool, int, float, str)):
+ return x
+
+ # Enums: tag with FQN to avoid primitive collisions.
+ # Ex: Enum(1) vs int(1) -> ("module.QualName", value).
+ if isinstance(x, enum.Enum):
+ enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
+ return (enum_type, normalize_value(x.value))
+
+ # Classes (types) are accepted and canonicalized by their fully-qualified
+ # name (module.qualname) for a stable identifier.
+ # Instances are only accepted if they expose uuid(); otherwise they are
+ # rejected to avoid under-hashing object state.
+
+ # Callables: accept classes only; reject funcs/lambdas/methods.
+ # Used by LogitsProcessor types and ModelConfig.hf_overrides.
+ if isinstance(x, type):
+ module = getattr(x, "__module__", "")
+ qual = getattr(x, "__qualname__", getattr(x, "__name__", ""))
+ return ".".join([p for p in (module, qual) if p]) or repr(x)
+
+ # Prefer stable uuid identifiers for objects that provide them, even if
+ # they are callable instances (e.g., InductorPass wrappers).
+ if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)):
+ return x.uuid()
+
+ if callable(x):
+ raise TypeError("normalize_value: function or callable instance unsupported")
+
+ # Torch dtype: stringify (torch.float64 -> "torch.float64").
+ # We rely on the string form here; dtype-bearing fields that need additional
+ # disambiguation should encode that at the config layer.
+ if isinstance(x, torch.dtype):
+ return str(x)
+
+ # Bytes
+ if isinstance(x, (bytes, bytearray)):
+ return x.hex()
+
+ # Paths (canonicalize)
+ if isinstance(x, pathlib.Path):
+ try:
+ return str(x.expanduser().resolve())
+ except Exception:
+ return str(x)
+
+ # Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability.
+ if is_dataclass(x):
+ type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
+ items = tuple(
+ (f.name, normalize_value(getattr(x, f.name)))
+ for f in sorted(fields(x), key=lambda f: f.name)
+ )
+ return (type_fqn, items)
+
+ # Containers (generic)
+ if isinstance(x, Mapping):
+ return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items()))
+ if isinstance(x, Set):
+ return tuple(sorted(repr(normalize_value(v)) for v in x))
+ if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
+ return tuple(normalize_value(v) for v in x)
+
+ # PretrainedConfig
+ if hasattr(x, "to_json_string") and callable(x.to_json_string):
+ return x.to_json_string()
+
+ # Unsupported type: e.g., modules, generators, open files, or objects
+ # without a stable JSON/UUID representation. Hard-error to avoid
+ # under-hashing.
+ # If you hit this, either reshape your config to use supported primitives
+ # and containers, or extend normalize_value to provide a stable encoding
+ # (e.g., via uuid() or to_json_string()) for this type.
+ raise TypeError(
+ f"normalize_value: unsupported type '{type(x).__name__}'. "
+ "Ensure config values use supported primitives/containers or add a "
+ "stable representation for this type."
+ )
+
+
+def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]:
+ """Gets the factors used for hashing a config class.
+ - Includes all dataclass fields not in `ignored_factors`.
+ - Errors on non-normalizable values.
+ """
+ factors: dict[str, object] = {}
+ for dc_field in fields(config):
+ factor = dc_field.name
+ if factor in ignored_factors:
+ continue
+ value = getattr(config, factor, None)
+ try:
+ factors[factor] = normalize_value(value)
+ except TypeError as e:
+ raise TypeError(
+ f"get_hash_factors: unsupported type for key '{factor}' "
+ f"({type(value).__name__})"
+ ) from e
+ return factors
+
+
+def hash_factors(items: dict[str, object]) -> str:
+ """Return a SHA-256 hex digest of the canonical items structure."""
+ return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest()
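A short hedged usage sketch of the new helpers, assuming the definitions added above land as-is; `ToyConfig` and its fields are illustrative, not a real vLLM config:

```python
# Opt-out hashing: every dataclass field participates in the hash unless it is
# explicitly listed in ignored_factors, so newly added fields change the cache
# key by default.
from dataclasses import dataclass

from vllm.config.utils import get_hash_factors, hash_factors


@dataclass
class ToyConfig:
    model: str = "m"
    log_level: str = "INFO"  # runtime-only knob we choose to ignore


factors = get_hash_factors(ToyConfig(), ignored_factors={"log_level"})
assert "log_level" not in factors and factors["model"] == "m"
key = hash_factors(factors)  # stable 64-char SHA-256 hex digest
```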
diff --git a/vllm/envs.py b/vllm/envs.py
index e61fb114325c6..212d68114e46e 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
-import hashlib
import json
+import logging
import os
import sys
import tempfile
@@ -426,6 +426,8 @@ def get_vllm_port() -> int | None:
# --8<-- [start:env-vars-definition]
+logger = logging.getLogger(__name__)
+
environment_variables: dict[str, Callable[[], Any]] = {
# ================== Installation Time Env Vars ==================
# Target device of vLLM, supporting [cuda (by default),
@@ -1540,85 +1542,88 @@ def is_set(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-def compute_hash() -> str:
- """
- WARNING: Whenever a new key is added to this environment
- variables, ensure that it is included in the factors list if
- it affects the computation graph. For example, different values
- of VLLM_PP_LAYER_PARTITION will generate different computation
- graphs, so it is included in the factors list. The env vars that
- affect the choice of different kernels or attention backends should
- also be included in the factors list.
- """
+def compile_factors() -> dict[str, object]:
+ """Return env vars used for torch.compile cache keys.
- # The values of envs may affects the computation graph.
- # TODO(DefTruth): hash all environment variables?
- # for key in environment_variables:
- # factorize(key)
- environment_variables_to_hash = [
- "VLLM_PP_LAYER_PARTITION",
- "VLLM_MLA_DISABLE",
- "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
- "VLLM_USE_TRITON_AWQ",
- "VLLM_DP_RANK",
- "VLLM_DP_SIZE",
- "VLLM_USE_STANDALONE_COMPILE",
- "VLLM_FUSED_MOE_CHUNK_SIZE",
- "VLLM_FLASHINFER_MOE_BACKEND",
- "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
- "VLLM_ATTENTION_BACKEND",
- "VLLM_USE_FLASHINFER_SAMPLER",
- "VLLM_DISABLED_KERNELS",
- "VLLM_USE_DEEP_GEMM",
- "VLLM_MOE_USE_DEEP_GEMM",
- "VLLM_USE_DEEP_GEMM_E8M0",
- "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
- "VLLM_USE_FLASHINFER_MOE_FP16",
- "VLLM_USE_FLASHINFER_MOE_FP8",
- "VLLM_USE_FLASHINFER_MOE_FP4",
- "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
- "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
- "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
- "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE",
- "VLLM_USE_CUDNN_PREFILL",
- "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
- "VLLM_USE_TRTLLM_ATTENTION",
- "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
- "VLLM_ROCM_USE_AITER",
- "VLLM_ROCM_USE_AITER_PAGED_ATTN",
- "VLLM_ROCM_USE_AITER_LINEAR",
- "VLLM_ROCM_USE_AITER_MOE",
- "VLLM_ROCM_USE_AITER_RMSNORM",
- "VLLM_ROCM_USE_AITER_MLA",
- "VLLM_ROCM_USE_AITER_MHA",
- "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM",
- "VLLM_ROCM_USE_AITER_TRITON_ROPE",
- "VLLM_ROCM_USE_AITER_FP8BMM",
- "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION",
- "VLLM_ROCM_USE_AITER_TRITON_GEMM",
- "VLLM_ROCM_USE_SKINNY_GEMM",
- "VLLM_ROCM_FP8_PADDING",
- "VLLM_ROCM_MOE_PADDING",
- "VLLM_ROCM_CUSTOM_PAGED_ATTN",
- "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
- "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16",
- "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB",
- "VLLM_ROCM_FP8_MFMA_PAGE_ATTN",
- "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE",
- "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
- "VLLM_NVFP4_GEMM_BACKEND",
- "VLLM_USE_FBGEMM",
- "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE",
- "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL",
- ]
- for key in environment_variables_to_hash:
- # if this goes out of sync with environment_variables,
- # it's not a user error, it's a bug
- assert key in environment_variables, (
- "Please update environment_variables_to_hash in envs.py"
- )
+    Start with every known vLLM env var, drop entries in `ignored_factors`,
+    and normalize everything else so the caller can hash it. This keeps the
+    compile cache key aligned across workers."""
- factors = [environment_variables[key]() for key in environment_variables_to_hash]
+ ignored_factors: set[str] = {
+ "MAX_JOBS",
+ "VLLM_RPC_BASE_PATH",
+ "VLLM_USE_MODELSCOPE",
+ "VLLM_RINGBUFFER_WARNING_INTERVAL",
+ "VLLM_DEBUG_DUMP_PATH",
+ "VLLM_PORT",
+ "VLLM_CACHE_ROOT",
+ "LD_LIBRARY_PATH",
+ "VLLM_SERVER_DEV_MODE",
+ "VLLM_DP_MASTER_IP",
+ "VLLM_DP_MASTER_PORT",
+ "VLLM_RANDOMIZE_DP_DUMMY_INPUTS",
+ "VLLM_CI_USE_S3",
+ "VLLM_MODEL_REDIRECT_PATH",
+ "VLLM_HOST_IP",
+ "S3_ACCESS_KEY_ID",
+ "S3_SECRET_ACCESS_KEY",
+ "S3_ENDPOINT_URL",
+ "VLLM_USAGE_STATS_SERVER",
+ "VLLM_NO_USAGE_STATS",
+ "VLLM_DO_NOT_TRACK",
+ "VLLM_LOGGING_LEVEL",
+ "VLLM_LOGGING_PREFIX",
+ "VLLM_LOGGING_STREAM",
+ "VLLM_LOGGING_CONFIG_PATH",
+ "VLLM_LOG_STATS_INTERVAL",
+ "VLLM_DEBUG_LOG_API_SERVER_RESPONSE",
+ "VLLM_TUNED_CONFIG_FOLDER",
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S",
+ "VLLM_HTTP_TIMEOUT_KEEP_ALIVE",
+ "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS",
+ "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH",
+ "VLLM_SLEEP_WHEN_IDLE",
+ "VLLM_IMAGE_FETCH_TIMEOUT",
+ "VLLM_VIDEO_FETCH_TIMEOUT",
+ "VLLM_AUDIO_FETCH_TIMEOUT",
+ "VLLM_MEDIA_URL_ALLOW_REDIRECTS",
+ "VLLM_MEDIA_LOADING_THREAD_COUNT",
+ "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB",
+ "VLLM_VIDEO_LOADER_BACKEND",
+ "VLLM_MEDIA_CONNECTOR",
+ "VLLM_ASSETS_CACHE",
+ "VLLM_ASSETS_CACHE_MODEL_CLEAN",
+ "VLLM_MM_INPUT_CACHE_GIB",
+ "VLLM_WORKER_MULTIPROC_METHOD",
+ "VLLM_ENABLE_V1_MULTIPROCESSING",
+ "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
+ "VLLM_CPU_KVCACHE_SPACE",
+ "VLLM_CPU_OMP_THREADS_BIND",
+ "VLLM_CPU_NUM_OF_RESERVED_CPU",
+ "VLLM_CPU_MOE_PREPACK",
+ "VLLM_CPU_SGL_KERNEL",
+ "VLLM_TEST_FORCE_LOAD_FORMAT",
+ "LOCAL_RANK",
+ "CUDA_VISIBLE_DEVICES",
+ }
+
+ from vllm.config.utils import normalize_value
+
+ factors: dict[str, object] = {}
+ for factor, getter in environment_variables.items():
+ if factor in ignored_factors:
+ continue
+
+ try:
+ raw = getter()
+ except Exception as exc: # pragma: no cover - defensive logging
+ logger.warning(
+ "Skipping environment variable %s while hashing compile factors: %s",
+ factor,
+ exc,
+ )
+ continue
+
+ factors[factor] = normalize_value(raw)
ray_noset_env_vars = [
# Refer to
@@ -1641,8 +1646,8 @@ def compute_hash() -> str:
"RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR",
"RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES",
]
- factors.extend([os.getenv(var) for var in ray_noset_env_vars])
- hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
+ for var in ray_noset_env_vars:
+ factors[var] = normalize_value(os.getenv(var))
- return hash_str
+ return factors
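
A rough sketch of how the dict-shaped result might be turned into a cache key; pairing it with hash_factors here is illustrative, not a claim about the exact call site in vLLM's compilation cache.

    import vllm.envs as envs
    from vllm.config.utils import hash_factors

    env_factors = envs.compile_factors()   # {"VLLM_ATTENTION_BACKEND": ..., ...}
    cache_key = hash_factors(env_factors)  # SHA-256 over the sorted JSON form
    print(cache_key[:16])
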
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index 7202259ca21aa..44b40ead973ba 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -2,9 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.lazy import lazy
from vllm.logging_utils.log_time import logtime
__all__ = [
"NewLineFormatter",
+ "lazy",
"logtime",
]
diff --git a/vllm/logging_utils/lazy.py b/vllm/logging_utils/lazy.py
new file mode 100644
index 0000000000000..3ade798962857
--- /dev/null
+++ b/vllm/logging_utils/lazy.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+from typing import Any
+
+
+class lazy:
+ """Wrap a zero-argument callable evaluated only during log formatting."""
+
+ __slots__ = ("_factory",)
+
+ def __init__(self, factory: Callable[[], Any]) -> None:
+ self._factory = factory
+
+ def __str__(self) -> str:
+ return str(self._factory())
+
+ def __repr__(self) -> str:
+ return str(self)
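
As a usage note, the wrapper defers work to the point where the log record is actually formatted, so with %-style logging arguments the callable never runs when the level is disabled. A small illustrative example (the expensive_summary helper is made up):

    import logging

    from vllm.logging_utils import lazy

    logger = logging.getLogger(__name__)


    def expensive_summary() -> str:
        # stands in for something costly, e.g. walking a large KV-cache mapping
        return "12345 blocks, 87% utilization"


    # expensive_summary() only runs if DEBUG records are actually emitted
    logger.debug("cache state: %s", lazy(lambda: expensive_summary()))
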
From 48fc8b1e595766af9c91edfc1de43f3a352575eb Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Wed, 19 Nov 2025 10:04:07 -0500
Subject: [PATCH 021/249] [BugFix] Fix async-scheduling + FlashAttn MLA
(#28990)
Signed-off-by: Lucas Wilkinson
---
vllm/v1/attention/backends/mla/common.py | 15 +++++++++------
vllm/v1/attention/backends/mla/flashattn_mla.py | 2 +-
vllm/v1/attention/backends/utils.py | 1 +
vllm/v1/worker/gpu_model_runner.py | 10 +++++++---
4 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 2ccdd1f143ce8..e328049b53c7e 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -755,6 +755,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
seq_lens = common_attn_metadata.seq_lens
seq_lens_cpu = common_attn_metadata.seq_lens_cpu
dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens
+ dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu
query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
@@ -944,18 +945,20 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
decode_metadata = None
if num_decodes > 0:
+ dcp_tot_seq_lens_device = None
+ if self.dcp_world_size > 1:
+ dcp_tot_seq_lens_device = seq_lens[:num_decodes]
+ seq_lens_cpu = dcp_local_seq_lens_cpu
+ seq_lens = dcp_local_seq_lens
+
decode_metadata = self._build_decode(
block_table_tensor=block_table_tensor[:num_decodes, ...],
seq_lens_cpu=seq_lens_cpu[:num_decodes],
- seq_lens_device=dcp_local_seq_lens[:num_decodes]
- if self.dcp_world_size > 1 and dcp_local_seq_lens is not None
- else seq_lens[:num_decodes],
+ seq_lens_device=seq_lens[:num_decodes],
query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1],
query_start_loc_device=query_start_loc[: num_decodes + 1],
num_decode_tokens=num_decode_tokens,
- dcp_tot_seq_lens_device=seq_lens[:num_decodes]
- if self.dcp_world_size > 1
- else None,
+ dcp_tot_seq_lens_device=dcp_tot_seq_lens_device,
)
attn_metadata = self.metadata_cls(
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index 7794e89cc0a94..12639edc8b9a1 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -173,7 +173,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
) -> FlashAttnMLADecodeMetadata:
query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
max_query_len = query_lens_cpu.max().item()
- max_seq_len = seq_lens_device.max().item()
+ max_seq_len = seq_lens_cpu.max().item()
# For Flash Attention MLA + full cudagraph
max_num_splits = 0
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 578153cda7863..0dd1896331291 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -92,6 +92,7 @@ class CommonAttentionMetadata:
encoder_seq_lens: np.ndarray | None = None
dcp_local_seq_lens: torch.Tensor | None = None
+ dcp_local_seq_lens_cpu: torch.Tensor | None = None
"""Sequence lengths of the local rank in decode context parallelism world"""
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 506118d2d762b..3b00085b6bb99 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1451,9 +1451,12 @@ class GPUModelRunner(
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
:num_reqs
]
- dcp_local_seq_lens = (
- self.dcp_local_seq_lens.gpu[:num_reqs] if self.dcp_world_size > 1 else None
- )
+
+ dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None
+ if self.dcp_world_size > 1:
+ dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs]
+ dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs]
+
spec_decode_common_attn_metadata = None
if for_cudagraph_capture:
@@ -1521,6 +1524,7 @@ class GPUModelRunner(
causal=True,
encoder_seq_lens=encoder_seq_lens,
dcp_local_seq_lens=dcp_local_seq_lens,
+ dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu,
)
if self.speculative_config and spec_decode_common_attn_metadata is None:
From d44e9df7d49a9bb3400b002c38c06fae2dd7d1e8 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Thu, 20 Nov 2025 00:24:55 +0800
Subject: [PATCH 022/249] [Model][Mamba] Add selector for mamba attention
backend and make it pluggable for other device (#26487)
Signed-off-by: shen-shanshan <467638484@qq.com>
---
docs/contributing/model/basic.md | 1 +
vllm/attention/__init__.py | 3 +-
vllm/attention/backends/registry.py | 114 +++++++++++++++---
vllm/attention/selector.py | 33 ++++-
vllm/model_executor/layers/kda.py | 8 +-
vllm/model_executor/layers/mamba/abstract.py | 10 +-
.../layers/mamba/linear_attn.py | 14 ---
.../layers/mamba/mamba_mixer.py | 10 +-
.../layers/mamba/mamba_mixer2.py | 9 --
.../model_executor/layers/mamba/short_conv.py | 9 --
vllm/model_executor/models/plamo2.py | 9 --
vllm/model_executor/models/qwen3_next.py | 9 +-
12 files changed, 144 insertions(+), 85 deletions(-)
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index a7b54f015c2da..d7f5d2f311a37 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -146,6 +146,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
+When adding a new mamba backend, also update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py).
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
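
To make the documented workflow concrete, a hypothetical registration of a third-party backend might look like the sketch below; the class name is made up, and the decorator form mirrors the register_backend docstring example further down in this patch.

    from vllm.attention.backends.registry import (
        MambaAttentionBackendEnum,
        register_backend,
    )


    @register_backend(MambaAttentionBackendEnum.CUSTOM, is_mamba=True)
    class MyMambaAttentionBackend:
        ...

A layer whose mamba_type property returns "custom" would then resolve to this class via MAMBA_TYPE_TO_BACKEND_MAP.
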
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index dd35165d5415e..8b4dc4013362e 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import (
AttentionType,
)
from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
+from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend
__all__ = [
"Attention",
@@ -15,4 +15,5 @@ __all__ = [
"AttentionMetadata",
"AttentionType",
"get_attn_backend",
+ "get_mamba_attn_backend",
]
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index f07a6059be377..51899b0235915 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend registry"""
-import enum
from collections.abc import Callable
+from enum import Enum, EnumMeta
from typing import TYPE_CHECKING, cast
from vllm.logger import init_logger
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
-class _AttentionBackendEnumMeta(enum.EnumMeta):
+class _AttentionBackendEnumMeta(EnumMeta):
"""Metaclass for AttentionBackendEnum to provide better error messages."""
def __getitem__(cls, name: str):
@@ -23,15 +23,15 @@ class _AttentionBackendEnumMeta(enum.EnumMeta):
try:
return super().__getitem__(name)
except KeyError:
- members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values()
- valid_backends = ", ".join(m.name for m in members)
+ members = cast("dict[str, Enum]", cls.__members__).keys()
+ valid_backends = ", ".join(members)
raise ValueError(
f"Unknown attention backend: '{name}'. "
f"Valid options are: {valid_backends}"
) from None
-class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
+class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
"""Enumeration of all supported attention backends.
The enum value is the default class path, but this can be overridden
@@ -83,7 +83,7 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
Raises:
ValueError: If Backend.CUSTOM is used without being registered
"""
- path = _OVERRIDES.get(self, self.value)
+ path = _ATTN_OVERRIDES.get(self, self.value)
if not path:
raise ValueError(
f"Backend {self.name} must be registered before use. "
@@ -111,18 +111,93 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
Returns:
True if the backend has a registered override
"""
- return self in _OVERRIDES
+ return self in _ATTN_OVERRIDES
def clear_override(self) -> None:
"""Clear any override for this backend, reverting to the default."""
- _OVERRIDES.pop(self, None)
+ _ATTN_OVERRIDES.pop(self, None)
-_OVERRIDES: dict[AttentionBackendEnum, str] = {}
+class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
+ """Enumeration of all supported mamba attention backends.
+
+ The enum value is the default class path, but this can be overridden
+ at runtime using register_backend().
+
+ To get the actual backend class (respecting overrides), use:
+ backend.get_class()
+ """
+
+ MAMBA1 = "vllm.v1.attention.backends.mamba1_attn.Mamba1AttentionBackend"
+ MAMBA2 = "vllm.v1.attention.backends.mamba2_attn.Mamba2AttentionBackend"
+ SHORT_CONV = "vllm.v1.attention.backends.short_conv_attn.ShortConvAttentionBackend"
+ LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend"
+ GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend"
+ # Placeholder for third-party/custom backends - must be registered before use
+ CUSTOM = ""
+
+ def get_path(self, include_classname: bool = True) -> str:
+ """Get the class path for this backend (respects overrides).
+
+ Returns:
+ The fully qualified class path string
+
+ Raises:
+ ValueError: If Backend.CUSTOM is used without being registered
+ """
+ path = _MAMBA_ATTN_OVERRIDES.get(self, self.value)
+ if not path:
+ raise ValueError(
+ f"Backend {self.name} must be registered before use. "
+ f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')"
+ )
+ if not include_classname:
+ path = path.rsplit(".", 1)[0]
+ return path
+
+ def get_class(self) -> "type[AttentionBackend]":
+ """Get the backend class (respects overrides).
+
+ Returns:
+ The backend class
+
+ Raises:
+ ImportError: If the backend class cannot be imported
+ ValueError: If Backend.CUSTOM is used without being registered
+ """
+ return resolve_obj_by_qualname(self.get_path())
+
+ def is_overridden(self) -> bool:
+ """Check if this backend has been overridden.
+
+ Returns:
+ True if the backend has a registered override
+ """
+ return self in _MAMBA_ATTN_OVERRIDES
+
+ def clear_override(self) -> None:
+ """Clear any override for this backend, reverting to the default."""
+ _MAMBA_ATTN_OVERRIDES.pop(self, None)
+
+
+MAMBA_TYPE_TO_BACKEND_MAP = {
+ "mamba1": MambaAttentionBackendEnum.MAMBA1.name,
+ "mamba2": MambaAttentionBackendEnum.MAMBA2.name,
+ "short_conv": MambaAttentionBackendEnum.SHORT_CONV.name,
+ "linear_attention": MambaAttentionBackendEnum.LINEAR.name,
+ "gdn_attention": MambaAttentionBackendEnum.GDN_ATTN.name,
+ "custom": MambaAttentionBackendEnum.CUSTOM.name,
+}
+
+
+_ATTN_OVERRIDES: dict[AttentionBackendEnum, str] = {}
+_MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {}
def register_backend(
- backend: AttentionBackendEnum, class_path: str | None = None
+ backend: AttentionBackendEnum | MambaAttentionBackendEnum,
+ is_mamba: bool = False,
+ class_path: str | None = None,
) -> Callable[[type], type]:
"""Register or override a backend implementation.
@@ -135,12 +210,17 @@ def register_backend(
Decorator function if class_path is None, otherwise a no-op
Examples:
- # Override an existing backend
+ # Override an existing attention backend
@register_backend(AttentionBackendEnum.FLASH_ATTN)
class MyCustomFlashAttn:
...
- # Register a custom third-party backend
+ # Override an existing mamba attention backend
+ @register_backend(MambaAttentionBackendEnum.LINEAR, is_mamba=True)
+ class MyCustomMambaAttn:
+ ...
+
+ # Register a custom third-party attention backend
@register_backend(AttentionBackendEnum.CUSTOM)
class MyCustomBackend:
...
@@ -153,11 +233,17 @@ def register_backend(
"""
def decorator(cls: type) -> type:
- _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}"
+ if is_mamba:
+ _MAMBA_ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index]
+ else:
+ _ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index]
return cls
if class_path is not None:
- _OVERRIDES[backend] = class_path
+ if is_mamba:
+ _MAMBA_ATTN_OVERRIDES[backend] = class_path # type: ignore[index]
+ else:
+ _ATTN_OVERRIDES[backend] = class_path # type: ignore[index]
return lambda x: x
return decorator
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 1a092db9ce378..e9af08b2316d2 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -12,7 +12,11 @@ import torch
import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.backends.registry import (
+ MAMBA_TYPE_TO_BACKEND_MAP,
+ AttentionBackendEnum,
+ MambaAttentionBackendEnum,
+)
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.utils import STR_BACKEND_ENV_VAR
@@ -197,6 +201,33 @@ def _cached_get_attn_backend(
return backend
+def get_mamba_attn_backend(
+ mamba_type: str,
+) -> type[AttentionBackend]:
+ """Select which mamba attention backend to use and lazily import it."""
+ return _cached_get_mamba_attn_backend(mamba_type)
+
+
+@cache
+def _cached_get_mamba_attn_backend(
+ mamba_type: str,
+) -> type[AttentionBackend]:
+ assert mamba_type and isinstance(mamba_type, str)
+
+ selected_backend = None
+ try:
+ backend_name = MAMBA_TYPE_TO_BACKEND_MAP[mamba_type]
+ selected_backend = MambaAttentionBackendEnum[backend_name]
+ except KeyError as e:
+ raise ValueError(
+            f"Invalid mamba attention backend type: '{mamba_type}'. Valid "
+ f"backends are: {list(MambaAttentionBackendEnum.__members__.keys())}"
+ ) from e
+
+ mamba_attn_backend = selected_backend.get_class()
+ return mamba_attn_backend
+
+
@contextmanager
def global_force_attn_backend_context_manager(
attn_backend: AttentionBackendEnum,
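
For reference, the selection path added above can be exercised directly; a minimal sketch (which class it resolves to depends on any registered overrides):

    from vllm.attention import get_mamba_attn_backend

    # "gdn_attention" maps to MambaAttentionBackendEnum.GDN_ATTN, which resolves
    # to vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend by default.
    backend_cls = get_mamba_attn_backend("gdn_attention")
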
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
index 2e7500bac7188..27cc3884517f9 100644
--- a/vllm/model_executor/layers/kda.py
+++ b/vllm/model_executor/layers/kda.py
@@ -5,7 +5,6 @@ import torch
from einops import rearrange
from torch import nn
-from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import (
@@ -83,12 +82,7 @@ direct_register_custom_op(
class KimiDeltaAttention(nn.Module, MambaBase):
@property
def mamba_type(self) -> str:
- return "linear_attention"
-
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend
-
- return GDNAttentionBackend
+ return "gdn_attention"
def get_state_dtype(
self,
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index e68b09b4d81f5..aa919d6fdc35c 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
import torch
+from vllm.attention.selector import get_mamba_attn_backend
from vllm.config import VllmConfig
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
@@ -38,11 +39,6 @@ class MambaBase(AttentionLayerBase):
def mamba_type(self) -> str:
pass
- @abstractmethod
- def get_attn_backend(self) -> type["AttentionBackend"]:
- """Get the attention backend class for this Mamba layer."""
- pass
-
@abstractmethod
def get_state_dtype(self) -> tuple[torch.dtype, ...]:
pass
@@ -69,3 +65,7 @@ class MambaBase(AttentionLayerBase):
else 0
),
)
+
+ def get_attn_backend(self) -> type["AttentionBackend"]:
+ """Get the attention backend class for this Mamba layer."""
+ return get_mamba_attn_backend(self.mamba_type)
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 0a2742ff49a44..d85b3e61c5d61 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -2,12 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
-
-from typing import TYPE_CHECKING
import torch
import torch.nn.functional as F
@@ -37,9 +31,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
-
class MiniMaxText01RMSNormTP(CustomOp):
name = "MiniMaxText01RMSNormTP"
@@ -123,11 +114,6 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
def mamba_type(self) -> str:
return "linear_attention"
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend
-
- return LinearAttentionBackend
-
def get_state_dtype(self) -> tuple[torch.dtype]:
assert self.model_config is not None
assert self.cache_config is not None
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index b6345b8af7f0a..90e520e244416 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,10 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, NamedTuple
-
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
+from typing import NamedTuple
import torch
from torch import nn
@@ -452,11 +449,6 @@ class MambaMixer(MambaBase, CustomOp):
def mamba_type(self) -> str:
return "mamba1"
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend
-
- return Mamba1AttentionBackend
-
def _time_proj_bias(self) -> torch.Tensor | None:
if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None:
return self.dt_proj.bias.float()
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 57313990b8206..900701c46348b 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -1,10 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
import torch
from torch import nn
@@ -908,11 +904,6 @@ class MambaMixer2(MambaBase, CustomOp):
def mamba_type(self) -> str:
return "mamba2"
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
-
- return Mamba2AttentionBackend
-
def mamba_mixer2(
projected_states: torch.Tensor,
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index 04efa8a8b3734..0bbad17d7ebc7 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -1,10 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
import torch
@@ -232,11 +228,6 @@ class ShortConv(MambaBase, CustomOp):
def mamba_type(self) -> str:
return "short_conv"
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend
-
- return ShortConvAttentionBackend
-
def short_conv(
hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 0c87f5000ff45..52c9755e0e0ea 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -4,10 +4,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from vllm.attention.backends.abstract import AttentionBackend
import torch
from torch import nn
@@ -467,11 +463,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
def mamba_type(self) -> str:
return "mamba2"
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend
-
- return Mamba2AttentionBackend
-
def plamo2_mamba_mixer(
hidden_states: torch.Tensor,
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 0415c8e00fdfa..ad631f61e4b93 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -10,7 +10,7 @@ from einops import rearrange
from torch import nn
from transformers.activations import ACT2FN
-from vllm.attention import Attention, AttentionBackend, AttentionMetadata
+from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
CacheConfig,
@@ -216,12 +216,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
@property
def mamba_type(self) -> str:
- return "linear_attention"
-
- def get_attn_backend(self) -> type["AttentionBackend"]:
- from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend
-
- return GDNAttentionBackend
+ return "gdn_attention"
def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
From a8b70304d68497ac1c432a2ff343e9bfb516c227 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 19 Nov 2025 18:06:36 +0100
Subject: [PATCH 023/249] Update `rope_scaling` to `rope_parameters` in
preparation for Transformers v5 (#28542)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
.buildkite/test-pipeline.yaml | 6 +-
benchmarks/kernels/benchmark_mrope.py | 19 ++--
.../offline_inference/context_extension.py | 6 +-
tests/compile/test_functionalization.py | 4 +-
tests/kernels/core/test_mrope.py | 16 +--
tests/kernels/core/test_pos_encoding.py | 39 +++----
.../moe/test_gpt_oss_triton_kernels.py | 2 +-
.../pooling/test_nomic_max_model_len.py | 16 +--
tests/test_config.py | 37 ++++---
vllm/config/model.py | 59 +++++------
.../layers/rotary_embedding/__init__.py | 76 ++++++-------
vllm/model_executor/models/afmoe.py | 17 +--
vllm/model_executor/models/apertus.py | 22 +---
vllm/model_executor/models/arcee.py | 11 --
vllm/model_executor/models/arctic.py | 3 +-
vllm/model_executor/models/baichuan.py | 8 +-
vllm/model_executor/models/bailing_moe.py | 3 +-
vllm/model_executor/models/bamba.py | 6 +-
vllm/model_executor/models/chameleon.py | 29 +----
vllm/model_executor/models/chatglm.py | 3 +-
vllm/model_executor/models/commandr.py | 5 +-
vllm/model_executor/models/config.py | 22 ++--
vllm/model_executor/models/dbrx.py | 7 +-
vllm/model_executor/models/deepseek_v2.py | 43 +++-----
vllm/model_executor/models/dots1.py | 11 +-
vllm/model_executor/models/ernie45_moe.py | 14 +--
vllm/model_executor/models/ernie45_vl_moe.py | 13 +--
vllm/model_executor/models/exaone.py | 21 +---
vllm/model_executor/models/exaone4.py | 19 +---
vllm/model_executor/models/falcon.py | 3 +-
vllm/model_executor/models/falcon_h1.py | 8 +-
vllm/model_executor/models/gemma.py | 8 +-
vllm/model_executor/models/gemma2.py | 5 +-
vllm/model_executor/models/gemma3.py | 21 ++--
vllm/model_executor/models/gemma3n.py | 20 ++--
vllm/model_executor/models/glm4.py | 10 +-
vllm/model_executor/models/glm4_1v.py | 1 -
vllm/model_executor/models/glm4_moe.py | 11 +-
vllm/model_executor/models/gpt_j.py | 3 +-
vllm/model_executor/models/gpt_neox.py | 3 +-
vllm/model_executor/models/gpt_oss.py | 13 ++-
vllm/model_executor/models/granite.py | 17 +--
vllm/model_executor/models/granitemoe.py | 13 +--
.../model_executor/models/granitemoehybrid.py | 5 +-
.../model_executor/models/granitemoeshared.py | 6 +-
vllm/model_executor/models/grok1.py | 11 +-
vllm/model_executor/models/hunyuan_v1.py | 25 +----
vllm/model_executor/models/internlm2.py | 12 +--
vllm/model_executor/models/internlm2_ve.py | 5 +-
vllm/model_executor/models/kimi_linear.py | 5 -
vllm/model_executor/models/lfm2.py | 17 +--
vllm/model_executor/models/lfm2_moe.py | 17 +--
vllm/model_executor/models/llama.py | 22 +---
vllm/model_executor/models/llama4.py | 11 +-
vllm/model_executor/models/longcat_flash.py | 22 ++--
vllm/model_executor/models/minicpm.py | 12 +--
vllm/model_executor/models/minicpm3.py | 10 +-
vllm/model_executor/models/minicpm_eagle.py | 5 +-
vllm/model_executor/models/minimax_m2.py | 12 +--
vllm/model_executor/models/minimax_text_01.py | 9 +-
vllm/model_executor/models/mixtral.py | 7 +-
vllm/model_executor/models/mllama4.py | 8 +-
vllm/model_executor/models/molmo.py | 3 +-
vllm/model_executor/models/nemotron.py | 17 +--
vllm/model_executor/models/nemotron_nas.py | 19 +---
vllm/model_executor/models/olmo.py | 3 +-
vllm/model_executor/models/olmo2.py | 13 +--
vllm/model_executor/models/olmoe.py | 6 +-
vllm/model_executor/models/openpangu.py | 26 ++---
vllm/model_executor/models/orion.py | 12 +--
vllm/model_executor/models/ouro.py | 11 +-
vllm/model_executor/models/persimmon.py | 3 +-
vllm/model_executor/models/phi.py | 6 +-
vllm/model_executor/models/phimoe.py | 18 ++--
vllm/model_executor/models/plamo2.py | 7 +-
vllm/model_executor/models/qwen.py | 11 +-
vllm/model_executor/models/qwen2.py | 16 +--
vllm/model_executor/models/qwen2_5_vl.py | 1 -
vllm/model_executor/models/qwen2_moe.py | 12 +--
vllm/model_executor/models/qwen2_vl.py | 1 -
vllm/model_executor/models/qwen3.py | 15 +--
vllm/model_executor/models/qwen3_moe.py | 12 +--
vllm/model_executor/models/qwen3_next.py | 3 +-
.../models/qwen3_omni_moe_thinker.py | 1 -
vllm/model_executor/models/qwen3_vl.py | 1 -
vllm/model_executor/models/seed_oss.py | 15 +--
vllm/model_executor/models/solar.py | 18 +---
vllm/model_executor/models/stablelm.py | 2 +-
vllm/model_executor/models/starcoder2.py | 3 +-
vllm/model_executor/models/step3_text.py | 16 ++-
.../models/transformers/utils.py | 10 +-
vllm/model_executor/models/zamba2.py | 4 +-
vllm/transformers_utils/config.py | 100 +++++++++++++-----
vllm/transformers_utils/configs/afmoe.py | 7 +-
vllm/transformers_utils/configs/arctic.py | 18 +++-
vllm/transformers_utils/configs/flex_olmo.py | 17 +--
.../transformers_utils/configs/kimi_linear.py | 12 ++-
vllm/transformers_utils/configs/lfm2_moe.py | 12 ++-
.../transformers_utils/configs/midashenglm.py | 2 +-
vllm/transformers_utils/configs/mistral.py | 4 +-
vllm/transformers_utils/configs/nemotron.py | 60 ++++++-----
vllm/transformers_utils/configs/olmo3.py | 12 ++-
vllm/transformers_utils/configs/qwen3_next.py | 17 +--
vllm/transformers_utils/configs/step3_vl.py | 12 ++-
104 files changed, 542 insertions(+), 910 deletions(-)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e62cd60efaec0..d4b6f4077ab32 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -872,12 +872,12 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+ - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+ - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index cb848d2bf579e..83bd91917508f 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -6,7 +6,7 @@
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
@@ -86,9 +86,8 @@ def benchmark_mrope(
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
- rope_theta: float = 10000,
is_neox_style: bool = True,
- rope_scaling: dict[str, Any] = None,
+ rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
- base=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dtype=dtype,
).to(device=device)
@@ -203,9 +201,8 @@ def benchmark_mrope(
num_kv_heads,
head_dim,
max_position,
- rope_theta,
is_neox_style,
- str(rope_scaling),
+ str(rope_parameters),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
@@ -255,9 +252,8 @@ if __name__ == "__main__":
"num_kv_heads",
"head_dim",
"max_position",
- "rope_theta",
"is_neox_style",
- "rope_scaling",
+ "rope_parameters",
"dtype",
"torch_mean",
"torch_median",
@@ -303,7 +299,7 @@ if __name__ == "__main__":
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
- rope_theta = config.rope_theta
+ rope_parameters = config.rope_parameters
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ if __name__ == "__main__":
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
- rope_theta=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=config.rope_scaling,
+ rope_parameters=rope_parameters,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py
index df39e4c25d5c8..67d33e1881ee9 100644
--- a/examples/offline_inference/context_extension.py
+++ b/examples/offline_inference/context_extension.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
-of a Qwen model using the YARN method (rope_scaling)
+of a Qwen model using the YARN method (rope_parameters)
and run a simple chat example.
Usage:
@@ -19,8 +19,8 @@ def create_llm():
# Use yarn to extend context
hf_overrides = {
- "rope_theta": rope_theta,
- "rope_scaling": {
+ "rope_parameters": {
+ "rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
index 11ae96e930da7..515e0a93ac2a8 100644
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -137,7 +137,7 @@ class TestRotaryEmbedding(torch.nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
- base=base,
+ rope_parameters={"rope_type": "default", "rope_theta": base},
)
def forward(self, positions, q, k):
@@ -172,7 +172,7 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=base,
+ rope_parameters={"rope_type": "default", "rope_theta": base},
)
def forward(self, positions, hidden_states):
diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py
index 02b795721f46e..43b242ab2d586 100644
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -5,11 +5,11 @@ from typing import NamedTuple
import pytest
import torch
from packaging.version import Version
-from transformers import AutoConfig
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
+from vllm.transformers_utils.config import get_config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -98,8 +98,7 @@ def test_mrope(
atol = model_info.atol
rtol = model_info.rtol
- config = AutoConfig.from_pretrained(model_name)
- config = config.get_text_config()
+ config = get_config(model_name, False).get_text_config()
# get the model config
total_num_kv_heads = config.num_key_value_heads
@@ -113,7 +112,6 @@ def test_mrope(
)
is_neox_style = True
- rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@@ -122,9 +120,8 @@ def test_mrope(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
- base=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=config.rope_scaling,
+ rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)
@@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing(
atol = model_info.atol
rtol = model_info.rtol
- config = AutoConfig.from_pretrained(model_name)
- config = config.get_text_config()
+ config = get_config(model_name, False).get_text_config()
# get the model config
total_num_kv_heads = config.num_key_value_heads
@@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing(
else config.hidden_size // total_num_heads
)
is_neox_style = True
- rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
- base=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=config.rope_scaling,
+ rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index c35ee5016ba05..a8ed3825689d3 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -74,7 +74,7 @@ def test_rotary_embedding(
device: str,
use_key: bool,
max_position: int = 8192,
- base: float = 10000,
+ rope_theta: float = 10000,
) -> None:
if rotary_dim is None:
rotary_dim = head_size
@@ -83,7 +83,8 @@ def test_rotary_embedding(
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
- rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+ rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len))
@@ -120,9 +121,9 @@ def test_rotary_embedding(
@torch.inference_mode()
def test_rope_module_cache():
MAX_POSITIONS = [123, 1234]
- BASES = [10000, 1000000]
- ROPE_SCALINGS = (
- None,
+ ROPE_THETAS = [10000, 1000000]
+ ROPE_PARAMETERS = (
+ {"rope_type": "default"},
{"rope_type": "linear", "factor": (1,)},
{"rope_type": "dynamic", "factor": 1},
)
@@ -130,9 +131,9 @@ def test_rope_module_cache():
HEAD_SIZES,
ROTARY_DIMS,
MAX_POSITIONS,
- BASES,
+ ROPE_THETAS,
IS_NEOX_STYLE,
- ROPE_SCALINGS,
+ ROPE_PARAMETERS,
DTYPES,
)
rope_setting_id_map: dict[str, int] = {}
@@ -141,20 +142,20 @@ def test_rope_module_cache():
head_size,
rotary_dim,
max_position,
- base,
- is_neox_stype,
- rope_scaling,
+ rope_theta,
+ is_neox_style,
+ rope_parameters,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
+ rope_parameters["rope_theta"] = rope_theta
rope = get_rope(
head_size,
rotary_dim,
max_position,
- base,
- is_neox_stype,
- rope_scaling,
+ is_neox_style,
+ rope_parameters,
dtype,
)
# different settings cannot share the same rope module
@@ -168,20 +169,20 @@ def test_rope_module_cache():
head_size,
rotary_dim,
max_position,
- base,
- is_neox_stype,
- rope_scaling,
+ rope_theta,
+ is_neox_style,
+ rope_parameters,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
+ rope_parameters["rope_theta"] = rope_theta
rope = get_rope(
head_size,
rotary_dim,
max_position,
- base,
- is_neox_stype,
- rope_scaling,
+ is_neox_style,
+ rope_parameters,
dtype,
)
# check if cache take effect
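
For callers outside the tests, the new convention is to fold rope_theta into rope_parameters and drop the positional base argument; a sketch with illustrative values (the full signature appears in the get_rope diff later in this patch):

    from vllm.model_executor.layers.rotary_embedding import get_rope

    rope = get_rope(
        head_size=128,
        rotary_dim=128,
        max_position=8192,
        is_neox_style=True,
        rope_parameters={"rope_type": "default", "rope_theta": 10000.0},
    )
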
diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
index dfd317bcf72f1..af33fd4e3fc3b 100644
--- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py
+++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -201,7 +201,7 @@ class ModelConfig:
sliding_window: int = 128
initial_context_length: int = 4096
rope_theta: float = 150000.0
- rope_scaling_factor: float = 32.0
+ rope_parameters_factor: float = 32.0
rope_ntk_alpha: float = 1.0
rope_ntk_beta: float = 32.0
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
index 88f088c603276..d6216a87a229e 100644
--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
+from typing import Any
+
import pytest
from ...utils import EmbedModelInfo
@@ -79,8 +81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS)
def test_use_rope_scaling_legal(model_info, vllm_runner):
hf_overrides = {
- "rope_theta": rope_theta,
- "rope_scaling": {
+ "rope_parameters": {
+ "rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS)
def test_use_rope_scaling_illegal(model_info, vllm_runner):
- hf_overrides = {
- "rope_theta": rope_theta,
- "rope_scaling": {
+ hf_overrides: dict[str, Any] = {
+ "rope_parameters": {
+ "rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
pass
hf_overrides = {
- "rope_theta": rope_theta,
- "rope_scaling": {
+ "rope_parameters": {
+ "rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
diff --git a/tests/test_config.py b/tests/test_config.py
index bba2fbec3db29..16f68d18fc68b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config():
def test_rope_customization():
- TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
- TEST_ROPE_THETA = 16_000_000.0
- LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
+ TEST_ROPE_PARAMETERS = {
+ "rope_theta": 16_000_000.0,
+ "rope_type": "dynamic",
+ "factor": 2.0,
+ }
+ LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"}
+ LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0}
llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
- assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
- assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
+ assert (
+ getattr(llama_model_config.hf_config, "rope_parameters", None)
+ == LLAMA_ROPE_PARAMETERS
+ )
assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
- hf_overrides={
- "rope_scaling": TEST_ROPE_SCALING,
- "rope_theta": TEST_ROPE_THETA,
- },
+ hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS},
)
assert (
- getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING
+ getattr(llama_model_config.hf_config, "rope_parameters", None)
+ == TEST_ROPE_PARAMETERS
)
- assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
- # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
+ # Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config
assert all(
- longchat_model_config.hf_config.rope_scaling.get(key) == value
- for key, value in LONGCHAT_ROPE_SCALING.items()
+ longchat_model_config.hf_config.rope_parameters.get(key) == value
+ for key, value in LONGCHAT_ROPE_PARAMETERS.items()
)
assert longchat_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
hf_overrides={
- "rope_scaling": TEST_ROPE_SCALING,
+ "rope_parameters": TEST_ROPE_PARAMETERS,
},
)
assert (
- getattr(longchat_model_config.hf_config, "rope_scaling", None)
- == TEST_ROPE_SCALING
+ getattr(longchat_model_config.hf_config, "rope_parameters", None)
+ == TEST_ROPE_PARAMETERS
)
assert longchat_model_config.max_model_len == 4096
diff --git a/vllm/config/model.py b/vllm/config/model.py
index b563a40eb8fc9..d1e56a72a318b 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -11,6 +11,7 @@ import torch
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
+from transformers.configuration_utils import ALLOWED_LAYER_TYPES
import vllm.envs as envs
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
@@ -2100,31 +2101,32 @@ def _get_and_verify_max_len(
)
derived_max_model_len = default_max_len
- rope_scaling = getattr(hf_config, "rope_scaling", None)
+    # In Transformers v5, rope_parameters may be a single TypedDict or a
+    # dict[str, TypedDict] keyed by layer type. To simplify verification,
+    # we normalize it to dict[str, TypedDict] here.
+ rope_parameters = getattr(hf_config, "rope_parameters", None)
+ if rope_parameters and not set(rope_parameters.keys()).issubset(
+ ALLOWED_LAYER_TYPES
+ ):
+ rope_parameters = {"": rope_parameters}
+
# NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
# scaling, so we skip applying the scaling factor again.
- if rope_scaling is not None and "gemma3" not in hf_config.model_type:
- # No need to consider "type" key because of patch_rope_scaling when
- # loading HF config
- rope_type = rope_scaling["rope_type"]
+ if rope_parameters is not None and "gemma3" not in hf_config.model_type:
+ scaling_factor = 1.0
+ for rp in rope_parameters.values():
+ # No need to consider "type" key because of patch_rope_parameters when
+ # loading HF config
+ rope_type = rp["rope_type"]
- if rope_type not in ("su", "longrope", "llama3"):
- if disable_sliding_window:
- # TODO(robertgshaw): Find a model that supports rope_scaling
- # with sliding window to see if this case should be allowed.
- raise NotImplementedError(
- "Disabling sliding window is not supported for models "
- "with rope_scaling. Please raise an issue so we can "
- "investigate."
- )
+ if rope_type not in ("su", "longrope", "llama3"):
+                # NOTE: rope_type == "default" does not define a factor. See
+                # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
+ # NOTE: This assumes all layer types have the same scaling factor.
+ scaling_factor = rp.get("factor", scaling_factor)
- # NOTE: rope_type == "default" does not define factor
- # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
- scaling_factor = rope_scaling.get("factor", 1.0)
-
- if rope_type == "yarn":
- derived_max_model_len = rope_scaling["original_max_position_embeddings"]
- derived_max_model_len *= scaling_factor
+ if rope_type == "yarn":
+ derived_max_model_len = rp["original_max_position_embeddings"]
+ # Do this outside loop since all layer types should have the same scaling
+ derived_max_model_len *= scaling_factor
if encoder_config and "max_seq_length" in encoder_config:
derived_max_model_len = encoder_config["max_seq_length"]
@@ -2134,7 +2136,9 @@ def _get_and_verify_max_len(
if max_model_len is None:
# For LongRoPE, default to original_max_position_embeddings to avoid
# performance degradation for shorter sequences
- if rope_scaling is not None and rope_scaling["rope_type"] == "longrope":
+ if rope_parameters is not None and any(
+ rp["rope_type"] == "longrope" for rp in rope_parameters.values()
+ ):
max_model_len = int(
getattr(
hf_config, "original_max_position_embeddings", derived_max_model_len
@@ -2151,16 +2155,7 @@ def _get_and_verify_max_len(
# that will be bigger than derived_max_model_len. We compare user input
# with model_max_length and allow this override when it's smaller.
model_max_length = getattr(hf_config, "model_max_length", None)
- if model_max_length is not None and max_model_len <= model_max_length:
- if disable_sliding_window:
- # TODO(robertgshaw): Find a model that has model_max_length
- # with sliding window to see if this case should be allowed.
- raise NotImplementedError(
- "Disabling sliding window is not supported for models "
- "model_max_length in the config. Please raise an issue "
- "so we can investigate."
- )
- else:
+ if model_max_length is None or max_model_len > model_max_length:
msg = (
f"User-specified max_model_len ({max_model_len}) is greater "
f"than the derived max_model_len ({max_len_key}="
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index 56c165f9c041a..ae8a7d93b50e4 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -26,23 +26,23 @@ def get_rope(
head_size: int,
rotary_dim: int,
max_position: int,
- base: float,
is_neox_style: bool = True,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype | None = None,
partial_rotary_factor: float = 1.0,
dual_chunk_attention_config: dict[str, Any] | None = None,
) -> RotaryEmbedding:
if dtype is None:
dtype = torch.get_default_dtype()
- if rope_scaling is not None:
+ if rope_parameters is not None:
# Transforms every value that is a list into a tuple for caching calls
- rope_scaling_tuple = {
- k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
+ rope_parameters_tuple = {
+ k: tuple(v) if isinstance(v, list) else v
+ for k, v in rope_parameters.items()
}
- rope_scaling_args = tuple(rope_scaling_tuple.items())
+ rope_parameters_args = tuple(rope_parameters_tuple.items())
else:
- rope_scaling_args = None
+ rope_parameters_args = None
if dual_chunk_attention_config is not None:
dual_chunk_attention_tuple = {
@@ -60,15 +60,15 @@ def get_rope(
head_size,
rotary_dim,
max_position,
- base,
is_neox_style,
- rope_scaling_args,
+ rope_parameters_args,
dual_chunk_attention_args,
dtype,
)
if key in _ROPE_DICT:
return _ROPE_DICT[key]
+ base = rope_parameters["rope_theta"] if rope_parameters else 10000
if dual_chunk_attention_config is not None:
extra_kwargs = {
k: v
@@ -84,18 +84,18 @@ def get_rope(
dtype,
**extra_kwargs,
)
- elif not rope_scaling:
+ elif not rope_parameters:
rotary_emb = RotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
else:
- scaling_type = rope_scaling["rope_type"]
+ scaling_type = rope_parameters["rope_type"]
if scaling_type == "llama3":
- scaling_factor = rope_scaling["factor"]
- low_freq_factor = rope_scaling["low_freq_factor"]
- high_freq_factor = rope_scaling["high_freq_factor"]
- original_max_position = rope_scaling["original_max_position_embeddings"]
+ scaling_factor = rope_parameters["factor"]
+ low_freq_factor = rope_parameters["low_freq_factor"]
+ high_freq_factor = rope_parameters["high_freq_factor"]
+ original_max_position = rope_parameters["original_max_position_embeddings"]
rotary_emb = Llama3RotaryEmbedding(
head_size,
rotary_dim,
@@ -113,7 +113,7 @@ def get_rope(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
elif scaling_type == "default":
- if "mrope_section" in rope_scaling:
+ if "mrope_section" in rope_parameters:
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
@@ -121,8 +121,8 @@ def get_rope(
base,
is_neox_style,
dtype,
- mrope_section=rope_scaling["mrope_section"],
- mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+ mrope_section=rope_parameters["mrope_section"],
+ mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
else:
rotary_emb = RotaryEmbedding(
@@ -134,7 +134,7 @@ def get_rope(
dtype,
)
elif scaling_type == "linear":
- scaling_factor = rope_scaling["factor"]
+ scaling_factor = rope_parameters["factor"]
rotary_emb = LinearScalingRotaryEmbedding(
head_size,
rotary_dim,
@@ -145,8 +145,8 @@ def get_rope(
dtype,
)
elif scaling_type == "ntk":
- scaling_factor = rope_scaling["factor"]
- mixed_b = rope_scaling.get("mixed_b", None)
+ scaling_factor = rope_parameters["factor"]
+ mixed_b = rope_parameters.get("mixed_b")
rotary_emb = NTKScalingRotaryEmbedding(
head_size,
rotary_dim,
@@ -158,8 +158,8 @@ def get_rope(
mixed_b,
)
elif scaling_type == "dynamic":
- if "alpha" in rope_scaling:
- scaling_alpha = rope_scaling["alpha"]
+ if "alpha" in rope_parameters:
+ scaling_alpha = rope_parameters["alpha"]
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
head_size,
rotary_dim,
@@ -169,8 +169,8 @@ def get_rope(
scaling_alpha,
dtype,
)
- elif "factor" in rope_scaling:
- scaling_factor = rope_scaling["factor"]
+ elif "factor" in rope_parameters:
+ scaling_factor = rope_parameters["factor"]
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
@@ -185,11 +185,11 @@ def get_rope(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "yarn":
- scaling_factor = rope_scaling["factor"]
- original_max_position = rope_scaling["original_max_position_embeddings"]
+ scaling_factor = rope_parameters["factor"]
+ original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
- for k, v in rope_scaling.items()
+ for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
@@ -199,7 +199,7 @@ def get_rope(
"apply_yarn_scaling",
)
}
- if "mrope_section" in rope_scaling:
+ if "mrope_section" in rope_parameters:
extra_kwargs.pop("apply_yarn_scaling", None)
rotary_emb = MRotaryEmbedding(
head_size,
@@ -208,8 +208,8 @@ def get_rope(
base,
is_neox_style,
dtype,
- mrope_section=rope_scaling["mrope_section"],
- mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+ mrope_section=rope_parameters["mrope_section"],
+ mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
scaling_factor=scaling_factor,
**extra_kwargs,
)
@@ -225,12 +225,12 @@ def get_rope(
**extra_kwargs,
)
elif scaling_type == "deepseek_yarn":
- scaling_factor = rope_scaling["factor"]
- original_max_position = rope_scaling["original_max_position_embeddings"]
+ scaling_factor = rope_parameters["factor"]
+ original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
extra_kwargs = {
k: v
- for k, v in rope_scaling.items()
+ for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
@@ -252,12 +252,12 @@ def get_rope(
**extra_kwargs,
)
elif scaling_type == "longrope":
- short_factor = rope_scaling["short_factor"]
- long_factor = rope_scaling["long_factor"]
- original_max_position = rope_scaling["original_max_position_embeddings"]
+ short_factor = rope_parameters["short_factor"]
+ long_factor = rope_parameters["long_factor"]
+ original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
- for k, v in rope_scaling.items()
+ for k, v in rope_parameters.items()
if k in ("short_mscale", "long_mscale")
}
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
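Note: with the signature change above, callers no longer pass base and rope_scaling separately; rope_theta now travels inside the single rope_parameters dict. A hedged before/after sketch of a call site (argument values are illustrative):

    # before
    rotary_emb = get_rope(
        head_size, rotary_dim=head_size, max_position=8192,
        base=10000.0,
        rope_scaling={"rope_type": "yarn", "factor": 4.0,
                      "original_max_position_embeddings": 2048},
    )

    # after
    rotary_emb = get_rope(
        head_size, rotary_dim=head_size, max_position=8192,
        rope_parameters={"rope_type": "yarn", "rope_theta": 10000.0, "factor": 4.0,
                         "original_max_position_embeddings": 2048},
    )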
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index 6f654f47495f7..4eb5665a71fc8 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -5,7 +5,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -171,8 +170,6 @@ class AfmoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
head_dim: int | None = None,
rms_norm_eps: float = 1e-05,
@@ -202,7 +199,6 @@ class AfmoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
# Check if this is a local attention layer
@@ -246,8 +242,7 @@ class AfmoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config["rope_parameters"],
is_neox_style=True,
)
else:
@@ -303,14 +298,6 @@ class AfmoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix
@@ -323,8 +310,6 @@ class AfmoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps,
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 0a8f21abb0a35..b75e91319bbad 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -118,8 +117,6 @@ class ApertusAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -155,7 +152,6 @@ class ApertusAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -176,9 +172,7 @@ class ApertusAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
- self._init_rotary_emb(
- config, rope_scaling=rope_scaling, quant_config=quant_config
- )
+ self._init_rotary_emb(config, quant_config=quant_config)
sliding_window = None
if layer_types := getattr(config, "layer_types", None):
@@ -224,7 +218,6 @@ class ApertusAttention(nn.Module):
def _init_rotary_emb(
self,
config: ApertusConfig,
- rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@@ -236,8 +229,7 @@ class ApertusAttention(nn.Module):
self.head_dim,
rotary_dim=int(self.partial_rotary_factor * self.head_dim),
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@@ -253,14 +245,6 @@ class ApertusDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -288,8 +272,6 @@ class ApertusDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index 20c3ff0754506..b3887b16f4d74 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -103,15 +103,6 @@ class ArceeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Rotary embedding parameters (reuse LLaMA defaults)
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Determine if attention bias is needed (some variants use bias terms)
attention_bias = getattr(config, "attention_bias", False) or getattr(
@@ -133,8 +124,6 @@ class ArceeDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index b5cc07a56535d..b75a254761d4e 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -292,7 +292,6 @@ class ArcticAttention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = config.max_position_embeddings
- self.rope_theta = config.rope_theta
self.scaling = self.head_dim**-0.5
self.qkv_proj = QKVParallelLinear(
@@ -317,7 +316,7 @@ class ArcticAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=int(self.rope_theta),
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 8991ef4c606b6..edf47270e5277 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -136,7 +136,7 @@ class BaiChuanAttention(nn.Module):
hidden_size: int,
num_heads: int,
position_embedding: str,
- rope_theta: float = 10000,
+ rope_parameters: dict,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -150,7 +150,6 @@ class BaiChuanAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.head_dim = hidden_size // self.total_num_heads
self.position_embedding = position_embedding
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
# pylint: disable=invalid-name
@@ -192,7 +191,7 @@ class BaiChuanAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(
@@ -229,13 +228,12 @@ class BaiChuanDecoderLayer(nn.Module):
):
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = BaiChuanAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
position_embedding=position_embedding,
- rope_theta=rope_theta,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index 024425bb24406..cc10e936a2d3d 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -135,9 +135,8 @@ class BailingAttention(nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=config.max_position_embeddings,
- base=config.rope_theta,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
- rope_scaling=config.rope_scaling,
partial_rotary_factor=self.partial_rotary_factor,
)
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index c6cc83487fec2..4422bb5da98f4 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -156,8 +156,6 @@ class BambaAttentionDecoderLayer(nn.Module):
prefix: str = "",
) -> None:
super().__init__()
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size()
@@ -178,7 +176,6 @@ class BambaAttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
@@ -192,8 +189,7 @@ class BambaAttentionDecoderLayer(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
- rope_scaling=rope_scaling,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
dtype=torch.get_default_dtype(), # see impl of get_rope
)
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 3c87bbfefab3d..b5a6d00dc309f 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -265,8 +265,7 @@ class ChameleonAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any],
max_position_embeddings: int = 4096,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -293,7 +292,6 @@ class ChameleonAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -318,8 +316,7 @@ class ChameleonAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
@@ -369,14 +366,6 @@ class ChameleonDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
self.self_attn = ChameleonAttention(
@@ -385,8 +374,7 @@ class ChameleonDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,
@@ -439,14 +427,6 @@ class ChameleonSwinDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
self.self_attn = ChameleonAttention(
@@ -455,8 +435,7 @@ class ChameleonSwinDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 5d6f5e9125a28..dbfcd62d0bcab 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -99,6 +99,7 @@ class GLMAttention(nn.Module):
# https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
rope_ratio = getattr(config, "rope_ratio", 1.0)
max_positions = getattr(config, "seq_length", 8192)
+ rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio}
# NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False,
# which is equivalent to is_neox_style=True
is_neox_style = not config.original_rope
@@ -106,7 +107,7 @@ class GLMAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim // 2,
max_position=max_positions,
- base=10000 * rope_ratio,
+ rope_parameters=rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 77bb178519813..5ed920927c772 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -156,8 +156,6 @@ class CohereAttention(nn.Module):
self.max_position_embeddings = getattr(
config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192)
- self.rope_theta = config.rope_theta
- self.rope_scaling = getattr(config, "rope_scaling", None)
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear(
self.hidden_size,
@@ -179,8 +177,7 @@ class CohereAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=self.rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=False,
)
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 66b246878b0aa..3cf4bf991e667 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
+from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
- "base": config.rope_theta,
- "rope_scaling": getattr(config, "rope_scaling", None),
+ "rope_parameters": config.rope_parameters,
}
@@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager:
max_position = round_up(max_position, 8)
+ set_default_rope_theta(config, default_theta=config.rotary_emb_base)
+
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": max_position,
- "base": getattr(config, "rope_theta", config.rotary_emb_base),
- "rope_scaling": getattr(config, "rope_scaling", None),
+ "rope_parameters": config.rope_parameters,
}
@@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
+
+ set_default_rope_theta(config, default_theta=config.rotary_emb_base)
+
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": rotary_emb_dim,
"max_position": max_trained_positions,
- "base": getattr(config, "rope_theta", config.rotary_emb_base),
- "rope_scaling": getattr(config, "rope_scaling", None),
+ "rope_parameters": config.rope_parameters,
}
# we ignore config.rotary_scaling_factor so that for datasets shorter
# than max_trained_positions 2048, the results are consistent
# with SentenceTransformer.
- # The context extension uses vllm style rope_theta and rope_scaling.
+ # The context extension uses vllm style rope_theta and rope_parameters.
# See #17785 #18755
if (
not vllm_config.model_config.hf_overrides
@@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
if hasattr(hf_text_config, "max_model_len"):
delattr(hf_text_config, "max_model_len")
hf_text_config.max_position_embeddings = max_trained_positions
- hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"]
+ hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]
# The priority of sentence_bert_config.json is higher
# than max_position_embeddings
@@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
- "base": config.rope_theta,
- "rope_scaling": getattr(config, "rope_scaling", None),
+ "rope_parameters": config.rope_parameters,
}
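Note: the config updates above call set_default_rope_theta before building rotary_kwargs. Its body is not part of this patch; a plausible sketch of the assumed behavior (purely illustrative, not the actual implementation):

    def set_default_rope_theta(config, default_theta: float) -> None:
        # Assumed: ensure config.rope_parameters exists and carries a rope_theta,
        # falling back to the given default (e.g. config.rotary_emb_base above).
        rope_parameters = getattr(config, "rope_parameters", None) or {"rope_type": "default"}
        rope_parameters.setdefault("rope_theta", default_theta)
        config.rope_parameters = rope_parameters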
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 528ef4f76742d..2c729019081a4 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -197,7 +197,10 @@ class DbrxAttention(nn.Module):
self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv
- self.rope_theta = config.attn_config.rope_theta
+ rope_parameters = {
+ "rope_type": "default",
+ "rope_theta": int(config.attn_config.rope_theta),
+ }
self.max_position = config.max_seq_len
# pylint: disable=invalid-name
@@ -221,7 +224,7 @@ class DbrxAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position,
- base=int(self.rope_theta),
+ rope_parameters=rope_parameters,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index e8ee9951d6119..6675b2133f386 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module):
config: DeepseekV2Config | DeepseekV3Config,
hidden_size: int,
num_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module):
assert num_heads % tp_size == 0
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
assert topk_indices_buffer is None, (
"topk_indices_buffer is not \
@@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.o_proj",
)
- if rope_scaling:
- rope_scaling["rope_type"] = "deepseek_yarn"
+ if config.rope_parameters["rope_type"] != "default":
+ config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=False,
)
- if rope_scaling:
- mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
- scaling_factor = rope_scaling["factor"]
+ if config.rope_parameters["rope_type"] != "default":
+ mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
+ scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if self.q_lora_rank is not None:
@@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
- if rope_scaling:
- rope_scaling["rope_type"] = "deepseek_yarn"
+ if config.rope_parameters["rope_type"] != "default":
+ config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=False,
)
- if rope_scaling:
- mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
- scaling_factor = rope_scaling["factor"]
+ if config.rope_parameters["rope_type"] != "default":
+ mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
+ scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
# DecoderLayers are created with `make_layers` which passes the prefix
@@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module):
v_head_dim=v_head_dim,
q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=kv_lora_rank,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
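Note: in the DeepSeek hunks above, the old truthiness test on rope_scaling becomes a check for a non-default rope_type, since rope_parameters always exists and carries at least rope_type and rope_theta. The mscale correction itself is unchanged; a short worked sketch, assuming the standard YaRN mscale formula and illustrative values:

    import math

    def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
        # Assumed YaRN attention-scale correction.
        if scale <= 1.0:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    base_scaling = (128 + 64) ** -0.5      # qk_nope_head_dim + qk_rope_head_dim (illustrative)
    mscale = yarn_get_mscale(40.0, 1.0)    # factor=40, mscale_all_dim=1.0 (illustrative)
    scaling = base_scaling * mscale * mscale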
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index d24da0c42a254..e65c275106a4e 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -202,8 +201,6 @@ class Dots1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
config: Dots1Config,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -229,7 +226,6 @@ class Dots1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
attention_bias = config.attention_bias
@@ -255,8 +251,7 @@ class Dots1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1])
self.layer_idx = layer_idx
@@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
config=config,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index f2999968669f6..a7df3509e3ecd 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
from .utils import (
@@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
+ rope_parameters: dict[str, Any],
head_dim: int | None = None,
- rope_theta: float = 500000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
qkv_bias: bool = False,
@@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=rope_parameters,
is_neox_style=False,
- rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
@@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 500000)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=500000)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
self.self_attn = Ernie4_5_MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "use_bias", False),
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index e8ef86f9b7f01..50e033d77606d 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .ernie45_moe import Ernie4_5_MoeMLP
from .interfaces import SupportsPP
@@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
+ rope_parameters: dict[str, Any],
head_dim: int | None = None,
- rope_theta: float = 500000,
- rope_scaling: dict[str, Any] | None = None,
freq_allocation: int = 20,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
@@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position_embeddings=max_position_embeddings,
- base=rope_theta,
+ base=rope_parameters["rope_theta"],
is_neox_style=False,
dtype=torch.get_default_dtype(),
mrope_section=[h_rope, w_rope, t_rope],
@@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 500000)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=500000)
freq_allocation = getattr(config, "freq_allocation", 20)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
@@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
freq_allocation=freq_allocation,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 6c56bfc433c7a..d13275488fe99 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -113,8 +112,6 @@ class ExaoneAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
@@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size=hidden_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=bias,
@@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index b89e168ada20e..70f3cce2b7c56 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -23,7 +23,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
@@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 1000000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module):
# apply rotary embeddings to every layer in full attention models
self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
+ set_default_rope_theta(config, default_theta=1000000)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
@@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 85acdff3d96b4..dc2d51f340c8c 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -164,13 +164,12 @@ class FalconAttention(nn.Module):
)
if self.use_rotary:
- rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index b985847af5daf..9433f0d1b4a49 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import (
HasInnerState,
@@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
prefix: str = "",
) -> None:
super().__init__()
- rope_theta = getattr(config, "rope_theta", 1e11)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=1e11)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size()
@@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
@@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
- rope_scaling=rope_scaling,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
dtype=None, # see impl of get_rope
)
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 7aaae7c503b58..00c7f59a08094 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -20,6 +20,7 @@
from collections.abc import Iterable
from functools import cache
from itertools import islice
+from typing import Any
import torch
from torch import nn
@@ -127,8 +128,8 @@ class GemmaAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
+ rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@@ -153,7 +154,6 @@ class GemmaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -176,7 +176,7 @@ class GemmaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
- rope_theta=config.rope_theta,
+ rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 4d5d6cbb37c62..9b6cfe6932300 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module):
num_kv_heads: int,
head_dim: int,
max_position_embeddings: int,
- rope_theta: float,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attn_logits_soft_cap: float | None = None,
@@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.query_pre_attn_scalar**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
@@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
- rope_theta=config.rope_theta,
cache_config=cache_config,
quant_config=quant_config,
attn_logits_soft_cap=config.attn_logit_softcapping,
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 357e61a4e78bf..565719ae7faeb 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module):
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
layer_idx = extract_layer_index(prefix)
- self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+ layer_type = config.layer_types[layer_idx]
+ self.is_sliding = layer_type == "sliding_attention"
sliding_window = config.sliding_window if self.is_sliding else None
# Initialize the rotary embedding.
- if self.is_sliding:
- # Local attention. Override the values in config.json.
- self.rope_theta = config.rope_local_base_freq
- self.rope_scaling = {"rope_type": "default"}
+ if layer_type in config.rope_parameters:
+ # Transformers v5 rope config.
+ rope_parameters = config.rope_parameters[layer_type]
else:
+ # Transformers v4 rope config.
# Global attention. Use the values in config.json.
- self.rope_theta = config.rope_theta
- self.rope_scaling = config.rope_scaling
+ rope_parameters = config.rope_parameters.copy()
+ # Local attention. Override the values in config.json.
+ if self.is_sliding:
+ rope_parameters["rope_theta"] = config.rope_local_base_freq
+
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=rope_parameters,
is_neox_style=True,
- rope_scaling=self.rope_scaling,
)
if getattr(config, "is_causal", True):
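Note: the Gemma3 hunks above (the Gemma3n change below is analogous) branch on whether config.rope_parameters is keyed by layer type (Transformers v5) or flat (Transformers v4). A hedged sketch of the two assumed shapes, with illustrative values:

    # Transformers v5 style: one entry per layer type
    rope_parameters_v5 = {
        "full_attention": {"rope_type": "linear", "rope_theta": 1_000_000.0, "factor": 8.0},
        "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
    }

    # Transformers v4 style: one flat dict; sliding layers override rope_theta locally
    rope_parameters_v4 = {"rope_type": "linear", "rope_theta": 1_000_000.0, "factor": 8.0}
    local_rope = rope_parameters_v4.copy()
    local_rope["rope_theta"] = 10_000.0    # e.g. config.rope_local_base_freq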
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 64443190f53ed..8f1447ba34a81 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module):
)
layer_idx = extract_layer_index(prefix)
- is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+ layer_type = config.layer_types[layer_idx]
+ is_sliding = layer_type == "sliding_attention"
self.sliding_window = config.sliding_window if is_sliding else None
# Initialize the rotary embedding.
- if is_sliding:
- # Local attention. Override the values in config.json.
- rope_theta = config.rope_local_base_freq
- rope_scaling = {"rope_type": "default"}
+ if layer_type in config.rope_parameters:
+ # Transformers v5 rope config.
+ rope_parameters = config.rope_parameters[layer_type]
else:
+ # Transformers v4 rope config.
# Global attention. Use the values in config.json.
- rope_theta = config.rope_theta
- rope_scaling = config.rope_scaling
+ rope_parameters = config.rope_parameters.copy()
+ # Local attention. Override the values in config.json.
+ if is_sliding:
+ rope_parameters["rope_theta"] = config.rope_local_base_freq
first_kv_shared_layer_idx = (
config.num_hidden_layers - config.num_kv_shared_layers
@@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=rope_parameters,
is_neox_style=True,
- rope_scaling=rope_scaling,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index faa0674a2e43d..f8ef3b0385fb1 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -57,10 +57,8 @@ class Glm4Attention(nn.Module):
max_position: int = 4096 * 32,
head_dim: int | None = None,
qkv_bias: bool = False,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
@@ -86,7 +84,6 @@ class Glm4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
@@ -107,8 +104,7 @@ class Glm4Attention(nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
is_neox_style=False,
)
@@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Glm4Attention(
config=config,
@@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=AttentionType.DECODER,
)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 7a4fee76ae6b3..6581bbda6d609 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
- base=10000.0,
is_neox_style=True,
)
self.blocks = nn.ModuleList(
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 1422dbe9b3cd0..5aa51af54a00b 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -26,7 +26,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
head_dim: int | None = None,
rms_norm_eps: float = 1e-05,
@@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = use_qk_norm
@@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
)
self.attn = Attention(
@@ -341,8 +336,6 @@ class Glm4MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
@@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index e416ecde0c1e0..e94de8952fa63 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -95,13 +95,12 @@ class GPTJAttention(nn.Module):
scaling = self.head_size**-0.5
assert getattr(config, "rotary", True)
assert config.rotary_dim % 2 == 0
- rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=config.rotary_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
is_neox_style=False,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index af0c9209231cb..815c2fba4d9fe 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
- rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 7df3b087ccb88..f310f71af92d9 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -67,16 +67,16 @@ class OAIAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
- base=config.rope_theta,
dtype=torch.float32,
- rope_scaling={
+ rope_parameters={
+ "rope_theta": config.rope_parameters["rope_theta"],
"rope_type": "yarn",
- "factor": config.rope_scaling["factor"],
- "original_max_position_embeddings": config.rope_scaling[
+ "factor": config.rope_parameters["factor"],
+ "original_max_position_embeddings": config.rope_parameters[
"original_max_position_embeddings"
],
- "beta_fast": config.rope_scaling["beta_fast"],
- "beta_slow": config.rope_scaling["beta_slow"],
+ "beta_fast": config.rope_parameters["beta_fast"],
+ "beta_slow": config.rope_parameters["beta_slow"],
},
is_neox_style=True,
)
@@ -90,7 +90,6 @@ class OAIAttention(nn.Module):
self.q_size = self.num_attention_heads * self.head_dim // tp_size
self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
self.scaling = self.head_dim**-0.5
- self.rope_theta = config.rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size=self.hidden_size,
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index c44b4021471ef..1dc205b47753d 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -112,8 +111,6 @@ class GraniteAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -143,7 +140,6 @@ class GraniteAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.attention_multiplier
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -167,8 +163,7 @@ class GraniteAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module):
super().__init__()
self.hidden_size = config.hidden_size
self.residual_multiplier = config.residual_multiplier
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 5c6759ded0669..8f4139d63c3f6 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attention_multiplier: float | None = None,
@@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module):
if attention_multiplier is not None
else self.head_dim**-1
)
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=int(self.rope_theta),
+ rope_parameters=rope_parameters,
is_neox_style=True,
- rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
@@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index a340112ec62ae..9d5eeef198a61 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
- base=int(config.rope_theta),
- rope_scaling=config.rope_scaling
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None
- else None,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
else:
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index 926c539af33be..fd346db7e35aa 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 9dc231863f74f..4bf23cd6fd19a 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -25,6 +25,7 @@
from collections.abc import Iterable
from itertools import islice
+from typing import Any
import torch
import torch.nn.functional as F
@@ -134,7 +135,7 @@ class Grok1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
+ rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@@ -161,7 +162,6 @@ class Grok1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -183,7 +183,7 @@ class Grok1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=int(self.rope_theta),
+ rope_parameters=rope_parameters,
is_neox_style=True,
)
@@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module):
if not self.use_fp8 and hasattr(quant_config, "is_fp8"):
self.use_fp8 = quant_config.is_fp8
- # Requires transformers > 4.32.0
- # Default rope_theta value if not in config
- rope_theta = 10000
self.attn = Grok1Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
+ rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 1eadcbe67ade3..9fa5e2bd33f21 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
-from typing import Any
import regex as re
import torch
@@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
@@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
@@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module):
if isinstance(config.intermediate_size, int)
else config.intermediate_size[layer_id]
)
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
attention_bias = getattr(config, "attention_bias", False) or getattr(
config, "bias", False
@@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
@@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 60fbeb842dd4b..dc8f821bd134f 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.key_value_groups = int(self.num_heads / self.num_kv_heads)
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.wqkv = QKVParallelLinear(
@@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 6dc081e34157b..a57db82242af9 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py
index f3675075a48f4..4562b2202c5ec 100644
--- a/vllm/model_executor/models/kimi_linear.py
+++ b/vllm/model_executor/models/kimi_linear.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
-from typing import Any
import torch
from torch import nn
@@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
- rope_theta: float = 10000,
use_nope: bool = False,
- rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
- self.rope_theta = rope_theta
self.use_nope = use_nope
assert self.use_nope is True
assert self.q_lora_rank is None
- assert rope_scaling is None
assert num_heads % tp_size == 0
self.kv_a_proj_with_mqa = ReplicatedLinear(
self.hidden_size,
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index aeb25602f11a4..74bdde27ece5c 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
import torch.nn as nn
@@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2Attention(
@@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index 6b7b5564ee989..c088a08211527 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
import torch.nn as nn
@@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2MoeAttention(
@@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 0a3f37c30ab5f..d5b49d2fb4c26 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -120,8 +119,6 @@ class LlamaAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -157,7 +154,6 @@ class LlamaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
@@ -186,9 +182,7 @@ class LlamaAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
- self._init_rotary_emb(
- config, rope_scaling=rope_scaling, quant_config=quant_config
- )
+ self._init_rotary_emb(config, quant_config=quant_config)
sliding_window = None
if layer_types := getattr(config, "layer_types", None):
@@ -258,7 +252,6 @@ class LlamaAttention(nn.Module):
def _init_rotary_emb(
self,
config: LlamaConfig,
- rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@@ -270,8 +263,7 @@ class LlamaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module):
quant_config = self.get_quant_config(vllm_config)
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index a7e0732ec71e2..4c6d1d4244755 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -19,7 +19,6 @@
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from collections.abc import Iterable
-from typing import Any
import torch
from torch import nn
@@ -171,8 +170,6 @@ class Llama4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -208,7 +205,6 @@ class Llama4Attention(nn.Module):
self.floor_scale = getattr(config, "floor_scale", 8192.0)
self.attn_scale = getattr(config, "attn_scale", 0.1)
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.n_rep = self.num_heads // self.num_kv_heads
self.qk_norm = (
@@ -248,8 +244,7 @@ class Llama4Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=int(rope_theta),
- rope_scaling=rope_scaling if rope_scaling != "default" else None,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
if not self.nope
@@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module):
self.layer_idx = extract_layer_index(prefix)
self.global_layer = config.no_rope_layers[self.layer_idx] == 0
self.hidden_size = config.hidden_size
- rope_theta = config.rope_theta
- rope_scaling = config.rope_scaling
max_position_embeddings = config.max_position_embeddings
self.self_attn = Llama4Attention(
@@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index 5de10e7086830..fafe97cd2be7e 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig):
eos_token_id=100001,
pretraining_tp=1,
tie_word_embeddings=False,
- rope_theta=1000000.0,
- rope_scaling=None,
+ rope_parameters=None,
attention_bias=False,
attention_dropout=0.0,
mla_scale_q_lora=False,
@@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig):
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+ # Prefer the legacy `rope_scaling` kwarg if provided, otherwise fall back to `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 1000000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mla_scale_q_lora = mla_scale_q_lora
@@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module):
super().__init__()
self.layer_idx = int(prefix.split(sep=".")[-1])
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
# Dual attention structure
self.self_attn = nn.ModuleList(
@@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None
),
kv_lora_rank=config.kv_lora_rank,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=None
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 914b097fe199e..04923833065f3 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
@@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
- self.rope_theta = getattr(config, "rope_theta", 10000)
- self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
@@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
- rope_theta=self.rope_theta,
- rope_scaling=self.rope_scaling,
+ rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index d3b6966ee3a7f..2d775219fc972 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -25,8 +25,6 @@
# limitations under the License.
"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
-from typing import Any
-
import torch
from torch import nn
from transformers import PretrainedConfig
@@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.q_a_proj = ReplicatedLinear(
@@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module):
self.qk_rope_head_dim,
rotary_dim=self.qk_rope_head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_local_heads,
@@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
v_head_dim=self.config.v_head_dim,
q_lora_rank=self.config.q_lora_rank,
kv_lora_rank=self.config.kv_lora_rank,
- rope_theta=self.rope_theta,
- rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index d0cdb70aa8574..e6bccfcac4f1a 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
- self.rope_theta = getattr(config, "rope_theta", 10000)
- self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
@@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
- rope_theta=self.rope_theta,
- rope_scaling=self.rope_scaling,
+ rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index 49d2f2d261969..4955c68c0cda8 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
rotary_dim: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
attn_window_size: int | None = None,
max_position_embeddings: int = 8192,
head_dim: int | None = None,
@@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module):
self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
max_position_embeddings = max(
@@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rotary_dim=config.rotary_dim,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index bf1ecc822756d..50f7396e2de60 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module):
num_kv_heads: int,
rotary_dim: int,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
+ rope_parameters: dict | None = None,
sliding_window: int | None = None,
quant_config: QuantizationConfig | None = None,
layer_idx: int = None,
@@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.sliding_window = sliding_window
self.prefix = prefix
@@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
- base=int(rope_theta),
+ rope_parameters=rope_parameters,
is_neox_style=True,
dtype=torch.float32,
)
@@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
self.hidden_size = config.hidden_size
self.expert_num = expert_num
- rope_theta = getattr(config, "rope_theta", 10000)
-
head_dim = getattr(config, "head_dim", None)
if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads
@@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
else head_dim,
num_kv_heads=config.num_key_value_heads,
max_position=max_position_embeddings,
- rope_theta=rope_theta,
+ rope_parameters=config.rope_parameters,
sliding_window=config.sliding_window,
quant_config=quant_config,
layer_idx=self._ilayer,
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index d7a1cb82fb4fb..54ab8dd493e73 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -161,7 +161,6 @@ class MixtralAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@@ -189,7 +188,6 @@ class MixtralAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -211,7 +209,7 @@ class MixtralAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=int(self.rope_theta),
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@@ -248,15 +246,12 @@ class MixtralDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = MixtralAttention(
config=config,
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index e25a104d822a7..286859d188d34 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -292,13 +292,17 @@ class Llama4VisionAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
+ rope_parameters = {
+ "rope_type": "mllama4",
+ "rope_theta": config.rope_parameters["rope_theta"],
+ }
+
self.rotary_emb = get_rope(
head_size=self.head_dim,
rotary_dim=config.hidden_size // config.num_attention_heads // 2,
# number of image patches
max_position=(config.image_size // config.patch_size) ** 2,
- base=config.rope_theta,
- rope_scaling={"rope_type": "mllama4"},
+ rope_parameters=rope_parameters,
is_neox_style=False,
dtype=torch.complex64, # important
)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index ab83a271e30a0..dc06938d5d6e1 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -410,7 +410,6 @@ class MolmoAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = config.max_position_embeddings
- self.rope_theta = config.rope_theta
# Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear(
@@ -437,7 +436,7 @@ class MolmoAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=config.rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 92dcf5ea57008..c3337bd1ea699 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -150,8 +149,6 @@ class NemotronAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -181,7 +178,6 @@ class NemotronAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.max_position_embeddings = max_position_embeddings
@@ -206,8 +202,7 @@ class NemotronAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
@@ -243,14 +238,6 @@ class NemotronDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -264,8 +251,6 @@ class NemotronDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index b839206a3094d..2eebe38051cbd 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
-from typing import Any
import torch
from torch import nn
@@ -82,8 +81,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -97,8 +94,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size,
num_heads,
num_kv_heads,
- rope_theta,
- rope_scaling,
max_position_embeddings,
quant_config,
bias,
@@ -111,7 +106,6 @@ class DeciLMAttention(LlamaAttention):
def _init_rotary_emb(
self,
config,
- rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
# Enables YARN for Mistral and LLaMA4 derivatives.
@@ -126,8 +120,7 @@ class DeciLMAttention(LlamaAttention):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@@ -148,14 +141,6 @@ class DeciLMDecoderLayer(nn.Module):
self._is_no_op_ffn = block_config.ffn.no_op
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -176,8 +161,6 @@ class DeciLMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=num_kv_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 487e3f671a455..bd8a8e317544f 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -87,7 +87,6 @@ class OlmoAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
- self.rope_theta = config.rope_theta
self.clip_qkv = config.clip_qkv
# Attention input projection. Projects x -> (q, k, v)
@@ -105,7 +104,7 @@ class OlmoAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=config.rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 045582c889ee4..f0f6b2f6b3e6d 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -99,7 +99,6 @@ class Olmo2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = self.config.max_position_embeddings
- self.rope_theta = self.config.rope_theta
# Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear(
@@ -139,15 +138,17 @@ class Olmo2Attention(nn.Module):
prefix=f"{prefix}.attn",
)
- # Rotary embeddings. Rope scaling is only applied on full attention
- # layers.
- self.rope_scaling = self.config.rope_scaling if sliding_window is None else None
+ # Rotary embeddings. Rope scaling is only applied on full attention layers.
+ if sliding_window is None:
+ rope_parameters = self.config.rope_parameters
+ else:
+ rope_theta = self.config.rope_parameters["rope_theta"]
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta, # type: ignore
- rope_scaling=self.rope_scaling,
+ rope_parameters=rope_parameters,
)
# Attention output projection.
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 499eb05de76e4..c39e338d72e22 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -123,8 +123,6 @@ class OlmoeAttention(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
num_heads = config.num_attention_heads
@@ -148,7 +146,6 @@ class OlmoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -176,8 +173,7 @@ class OlmoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index d13a745beffeb..f814cdfec5a22 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -77,6 +77,7 @@ from vllm.model_executor.models.utils import (
sequence_parallel_chunk,
)
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
def check_ffn_act_fn(act_fn: str):
@@ -259,7 +260,6 @@ class OpenPanguMLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
- rope_theta: float = 10000,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -274,8 +274,6 @@ class OpenPanguMLAAttention(nn.Module):
self.v_head_dim = v_head_dim
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
- self.rope_theta = rope_theta
-
self.tp_size = get_tensor_model_parallel_world_size()
if num_heads % self.tp_size != 0:
raise ValueError(
@@ -339,7 +337,9 @@ class OpenPanguMLAAttention(nn.Module):
)
# TODO: remove hard coding
- rope_scaling = {
+ set_default_rope_theta(config, default_theta=10000)
+ rope_parameters = {
+ "rope_theta": config.rope_parameters["rope_theta"],
"beta_fast": 32,
"beta_slow": 1,
"factor": 1,
@@ -353,8 +353,7 @@ class OpenPanguMLAAttention(nn.Module):
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
is_neox_style=False,
)
@@ -407,8 +406,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -454,7 +451,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -475,9 +471,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
- self._init_rotary_emb(
- config, rope_scaling=rope_scaling, quant_config=quant_config
- )
+ self._init_rotary_emb(config, quant_config=quant_config)
if hasattr(config, "interleaved_sliding_window"):
interleaved_sliding_window = config.interleaved_sliding_window
@@ -521,7 +515,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
def _init_rotary_emb(
self,
config: PretrainedConfig,
- rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@@ -533,8 +526,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
@@ -555,7 +547,6 @@ class OpenPanguDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1])
@@ -579,7 +570,6 @@ class OpenPanguDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None
),
kv_lora_rank=config.kv_lora_rank,
- rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
@@ -607,8 +597,6 @@ class OpenPanguDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=getattr(config, "rope_scaling", None),
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 859cd2cecf897..b30be93ca726f 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -88,8 +88,7 @@ class OrionAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -115,7 +114,6 @@ class OrionAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -139,8 +137,7 @@ class OrionAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -175,15 +172,12 @@ class OrionDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = OrionAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 9db6c317c26a8..63d2fff6ec8bc 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -112,10 +112,8 @@ class OuroAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@@ -140,7 +138,6 @@ class OuroAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
# Get total_ut_steps from config, default to 4 if not specified
@@ -170,8 +167,7 @@ class OuroAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = nn.ModuleList()
@@ -226,9 +222,6 @@ class OuroDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@@ -244,10 +237,8 @@ class OuroDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 3bf6a1d9763d0..98963d52e4848 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
- self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
@@ -138,7 +137,7 @@ class PersimmonAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=self.rope_theta,
+ rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.scaling = self.head_dim**-0.5
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 8fee53c23fb4b..da476f621627b 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -115,16 +115,12 @@ class PhiAttention(nn.Module):
)
assert rotary_dim % 2 == 0
- # pylint: disable=C0301
- # Refer to:
- # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518
- rope_theta = getattr(config, "rope_theta", 10000.0)
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
- base=rope_theta,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 92fd858b608bc..8ffac95d93960 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -86,7 +86,7 @@ class PhiMoEConfig(PretrainedConfig):
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
- rope_theta=1e6,
+ rope_parameters=None,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=2,
@@ -119,7 +119,9 @@ class PhiMoEConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
+ if rope_parameters is None:
+ rope_theta = kwargs.pop("rope_theta", 1e6)
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
@@ -302,12 +304,11 @@ class PhiMoEAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
+ rope_parameters: dict,
head_dim: int | None = None,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: dict | None = None,
prefix: str = "",
) -> None:
super().__init__()
@@ -332,8 +333,6 @@ class PhiMoEAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -355,9 +354,8 @@ class PhiMoEAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=int(self.rope_theta),
+ rope_parameters=rope_parameters,
is_neox_style=True,
- rope_scaling=self.rope_scaling,
)
self.attn = Attention(
self.num_heads,
@@ -393,7 +391,5 @@ class PhiMoEDecoderLayer(nn.Module):
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = PhiMoEAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@@ -402,10 +399,9 @@ class PhiMoEDecoderLayer(nn.Module):
head_dim=getattr(
config, "head_dim", self.hidden_size // config.num_attention_heads
),
- rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=config.rope_scaling,
+ rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
)
self.block_sparse_moe = PhiMoE(
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 52c9755e0e0ea..22f9c87fc905b 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -567,10 +567,6 @@ class Plamo2AttentionMixer(nn.Module):
prefix=f"{prefix}.o_proj",
)
- self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000
- self.rope_scaling = (
- config.rope_scaling if hasattr(config, "rope_scaling") else None
- )
max_position = config.max_position_embeddings
if hasattr(vllm_config.model_config, "max_model_len") and isinstance(
vllm_config.model_config.max_model_len, int
@@ -581,8 +577,7 @@ class Plamo2AttentionMixer(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=self.rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps)
self.q_norm.weight = torch.nn.Parameter(
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 50a125c3f5973..c973e79170982 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -83,8 +83,7 @@ class QWenAttention(nn.Module):
hidden_size: int,
num_heads: int,
max_position_embeddings: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@@ -117,8 +116,7 @@ class QWenAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -153,14 +151,11 @@ class QWenBlock(nn.Module):
super().__init__()
self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
self.attn = QWenAttention(
config.hidden_size,
config.num_attention_heads,
config.max_position_embeddings,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 1bbb969ce5aa3..32b6d6dd07b83 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import is_interleaved
+from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
@@ -114,11 +114,10 @@ class Qwen2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
+ rope_parameters: dict[str, Any],
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@@ -143,7 +142,6 @@ class Qwen2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear(
@@ -167,8 +165,7 @@ class Qwen2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
attn_cls = (
@@ -216,9 +213,7 @@ class Qwen2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@@ -237,10 +232,9 @@ class Qwen2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
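Qwen2 (and Qwen3/SeedOss below) replace the old `getattr(config, "rope_theta", ...)` fallback with `set_default_rope_theta`, which is added to vllm/transformers_utils/config.py later in this patch. A rough stand-in showing the intended behavior on a bare config object; the real helper operates on a PretrainedConfig:

from types import SimpleNamespace

def set_default_rope_theta(config, default_theta: float) -> None:
    # Fill in rope_parameters only where the checkpoint left it unspecified.
    if getattr(config, "rope_parameters", None) is None:
        config.rope_parameters = {"rope_type": "default"}
    config.rope_parameters.setdefault("rope_theta", default_theta)

cfg = SimpleNamespace(rope_parameters=None)
set_default_rope_theta(cfg, default_theta=1_000_000)
print(cfg.rope_parameters)  # {'rope_type': 'default', 'rope_theta': 1000000}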
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 5b5d50ec8935a..8e3c0e84dfe51 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -641,7 +641,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
- base=10000.0,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 2ff0d19df238c..6b97d0b2ca2e3 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -194,8 +194,7 @@ class Qwen2MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@@ -222,7 +221,6 @@ class Qwen2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config
@@ -248,8 +246,7 @@ class Qwen2MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@@ -291,8 +288,6 @@ class Qwen2MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@@ -301,8 +296,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index cda8eaf5377f1..d25ff2785bfef 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -643,7 +643,6 @@ class Qwen2VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
- base=10000.0,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 8d7f22a33fe6c..93a629d81e8ff 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
@@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
+ rope_parameters: dict,
max_position: int = 4096 * 32,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
qkv_bias: bool = False,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear(
@@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 96751fee800bb..8ee3dd99e11db 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
@@ -247,7 +246,6 @@ class Qwen3MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config
@@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
@@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index ad631f61e4b93..bfed64728305e 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
- base=config.rope_theta,
- rope_scaling=config.rope_scaling,
+ rope_parameters=config.rope_parameters,
partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config,
)
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index d2fd74a5e41ad..54ef56f83344e 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
- base=10000.0,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 0c546309400b7..c10aeaec5ab83 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
- base=10000.0,
is_neox_style=True,
)
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index bf211d28f1844..4744d8e44f390 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
@@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
+ rope_parameters: dict,
max_position: int = 4096 * 32,
- rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
- rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
@@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
- base=self.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- # Requires transformers > 4.32.0
- rope_theta = getattr(config, "rope_theta", 1000000)
- rope_scaling = getattr(config, "rope_scaling", None)
+ set_default_rope_theta(config, default_theta=1000000)
# By default, SeedOss uses causal attention as it is a
# decoder-only model.
@@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module):
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
- rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 4ec855f794446..7e9fc51036d2e 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -25,7 +25,6 @@
"""Inference-only Solar model compatible with HuggingFace weights."""
from collections.abc import Iterable
-from typing import Any
import torch
from torch import nn
@@ -111,8 +110,6 @@ class SolarAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
- rope_theta: float = 10000,
- rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@@ -142,7 +139,6 @@ class SolarAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -166,8 +162,7 @@ class SolarAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
- rope_theta = getattr(config, "rope_theta", 10000)
- rope_scaling = getattr(config, "rope_scaling", None)
-
- if rope_scaling is not None and getattr(
- config, "original_max_position_embeddings", None
- ):
- rope_scaling["original_max_position_embeddings"] = (
- config.original_max_position_embeddings
- )
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
- rope_theta=rope_theta,
- rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 06eb7201c1a89..a738fcbb4ee28 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -153,7 +153,7 @@ class StablelmAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings,
- base=self.config.rope_theta,
+ rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 0f2942acd5006..1118fca3cac91 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
- self.rope_theta = config.rope_theta
self.max_position_embeddings = config.max_position_embeddings
self.use_bias = config.use_bias
@@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
- base=int(self.rope_theta),
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py
index 4fff356b29e28..3c377a2c539df 100644
--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.step3_vl import Step3TextConfig
from .interfaces import SupportsPP
from .utils import (
@@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
norm_eps: float,
- rope_theta: int,
+ rope_parameters: dict[str, Any],
share_q_dim: int | None = None,
- rope_scaling: dict[str, Any] | None = None,
max_position_embedding: int = 8192,
head_dim: int = 256,
cache_config: CacheConfig | None = None,
@@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embedding,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
)
scaling = self.head_dim**-0.5
self.attn = Attention(
@@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module):
class Step3TextDecoderLayer(nn.Module):
def __init__(
self,
- config: ModelConfig,
+ config: Step3TextConfig,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()
- config = config.hf_config
self.hidden_size = config.hidden_size
- rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Step3TextAttention(
hidden_size=self.hidden_size,
@@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module):
max_position_embedding=config.max_position_embedding,
head_dim=config.head_dim,
share_q_dim=config.share_q_dim,
- rope_theta=config.rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
)
@@ -338,7 +334,7 @@ class Step3TextModel(nn.Module):
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
lambda prefix: Step3TextDecoderLayer(
- config=vllm_config.model_config,
+ config=config,
cache_config=cache_config,
quant_config=quant_config,
prefix=prefix,
diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py
index 517eb54d53ac6..b807f45b5d52b 100644
--- a/vllm/model_executor/models/transformers/utils.py
+++ b/vllm/model_executor/models/transformers/utils.py
@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal
import torch
from torch import nn
+from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
@@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
"""
text_config = vllm_config.model_config.hf_config.get_text_config()
# Dynamic rope scaling is not compatible with torch.compile
- rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {}
- return rope_scaling.get("rope_type") != "dynamic"
+ rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {}
+ if rope_parameters:
+ # Nest rope_parameters if not nested already to simplify logic
+ if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
+ rope_parameters = {"": rope_parameters}
+ return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())
+ return True
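`can_enable_torch_compile` now has to cope with two shapes of `rope_parameters`: a flat dict, or a dict keyed by layer type for interleaved-attention models. A self-contained sketch of the same check; the layer-type names below are assumptions standing in for transformers' `ALLOWED_LAYER_TYPES`:

ALLOWED_LAYER_TYPES = ("full_attention", "sliding_attention")  # assumed subset

def no_dynamic_rope(rope_parameters: dict | None) -> bool:
    if not rope_parameters:
        return True
    # Wrap a flat dict so both shapes go through the same loop.
    if not set(rope_parameters).issubset(ALLOWED_LAYER_TYPES):
        rope_parameters = {"": rope_parameters}
    return all(rp.get("rope_type") != "dynamic" for rp in rope_parameters.values())

print(no_dynamic_rope({"rope_type": "dynamic", "rope_theta": 1e6}))   # False
print(no_dynamic_rope({"full_attention": {"rope_type": "yarn"}}))     # True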
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 729a9655d0879..653b5b9beef7b 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.config = config
self.num_hybrid_layers = num_hybrid_layers
- self.rope_theta = config.rope_theta
self.attention_hidden_size = config.attention_hidden_size
self.total_num_attention_heads = config.num_attention_heads
@@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module):
head_size=self.attention_head_dim,
rotary_dim=self.attention_head_dim,
max_position=config.max_position_embeddings,
- base=self.rope_theta,
- rope_scaling=None,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ac4a71648cec8..4ca155af03dca 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -7,8 +7,9 @@ import time
from collections.abc import Callable
from dataclasses import asdict
from functools import cache, partial
+from importlib.metadata import version
from pathlib import Path
-from typing import Any, Literal, TypeVar
+from typing import Any, Literal, TypeAlias, TypeVar
import huggingface_hub
from huggingface_hub import (
@@ -24,7 +25,9 @@ from huggingface_hub.utils import (
RepositoryNotFoundError,
RevisionNotFoundError,
)
+from packaging.version import Version
from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
+from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -390,21 +393,61 @@ def file_or_path_exists(
)
-def patch_rope_scaling(config: PretrainedConfig) -> None:
+def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None:
+ """Some models may have no rope_theta in their config but still use RoPE.
+ This function sets a default rope_theta if it's missing."""
+ if getattr(config, "rope_parameters", None) is None:
+ config.rope_parameters = {"rope_type": "default"}
+ if "rope_theta" not in config.rope_parameters:
+ config.rope_parameters["rope_theta"] = default_theta
+
+
+def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
- text_config = getattr(config, "text_config", None)
- if text_config is not None:
- patch_rope_scaling(text_config)
+ # Retrieve rope_parameters differently based on Transformers version
+ if Version(version("transformers")) >= Version("5.0.0.dev0"):
+ from transformers.modeling_rope_utils import RopeParameters
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is not None:
- patch_rope_scaling_dict(rope_scaling)
+ rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr(
+ config, "rope_parameters", None
+ )
+ elif hasattr(config, "rope_parameters"):
+ # We are in Transformers v4 and rope_parameters
+ # has already been patched for this config
+ return
+ else:
+ # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters
+ rope_theta: float | None = getattr(config, "rope_theta", None)
+ rope_scaling: dict | None = getattr(config, "rope_scaling", None)
+ rope_parameters = rope_scaling
+ # Move rope_theta into rope_parameters
+ if rope_theta is not None:
+ rope_parameters = rope_parameters or {"rope_type": "default"}
+ rope_parameters["rope_theta"] = rope_theta
+ # Add original_max_position_embeddings if present
+ if rope_parameters and (
+ ompe := getattr(config, "original_max_position_embeddings", None)
+ ):
+ rope_parameters["original_max_position_embeddings"] = ompe
+ # Write back to config
+ config.rope_parameters = rope_parameters
+
+ # No RoPE parameters to patch
+ if rope_parameters is None:
+ return
+
+ # Handle nested rope_parameters in interleaved sliding attention models
+ if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
+ for rope_parameters_layer_type in rope_parameters.values():
+ patch_rope_parameters_dict(rope_parameters_layer_type)
+ else:
+ patch_rope_parameters_dict(rope_parameters)
-def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
- if "rope_type" in rope_scaling and "type" in rope_scaling:
- rope_type = rope_scaling["rope_type"]
- rope_type_legacy = rope_scaling["type"]
+def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
+ if "rope_type" in rope_parameters and "type" in rope_parameters:
+ rope_type = rope_parameters["rope_type"]
+ rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
@@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
"You should only specify one of them."
)
- if "rope_type" not in rope_scaling and "type" in rope_scaling:
- rope_scaling["rope_type"] = rope_scaling["type"]
+ if "rope_type" not in rope_parameters and "type" in rope_parameters:
+ rope_parameters["rope_type"] = rope_parameters["type"]
logger.info("Replacing legacy 'type' key with 'rope_type'")
- if "rope_type" not in rope_scaling:
- raise ValueError("rope_scaling should have a 'rope_type' key")
+ if "rope_type" not in rope_parameters:
+ raise ValueError("rope_parameters should have a 'rope_type' key")
- if rope_scaling["rope_type"] == "su":
- rope_scaling["rope_type"] = "longrope"
+ if rope_parameters["rope_type"] == "su":
+ rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
- elif rope_scaling["rope_type"] == "mrope":
- assert "mrope_section" in rope_scaling
- rope_scaling["rope_type"] = "default"
+ elif rope_parameters["rope_type"] == "mrope":
+ assert "mrope_section" in rope_parameters
+ rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
def _uses_mrope(config: PretrainedConfig) -> bool:
- rope_scaling = getattr(config, "rope_scaling", None)
- if rope_scaling is None:
+ rope_parameters = getattr(config, "rope_parameters", None)
+ if rope_parameters is None:
return False
- return "mrope_section" in rope_scaling
+ return "mrope_section" in rope_parameters
def uses_mrope(config: PretrainedConfig) -> bool:
@@ -690,7 +733,14 @@ def get_config(
logger.debug("Overriding HF config with %s", hf_overrides_fn)
config = hf_overrides_fn(config)
- patch_rope_scaling(config)
+ # Exhaustively patch RoPE parameters everywhere they might be
+ patch_rope_parameters(config)
+ patch_rope_parameters(config.get_text_config())
+ SubConfigs: TypeAlias = dict[str, PretrainedConfig]
+ sub_configs: SubConfigs | None = getattr(config, "sub_configs", None)
+ if sub_configs:
+ for sub_config in sub_configs:
+ patch_rope_parameters(getattr(config, sub_config))
if trust_remote_code:
maybe_register_config_serialize_by_value()
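`patch_rope_parameters` is the compatibility shim for the whole series: on Transformers v5 it reads the native `rope_parameters`, while on v4 it synthesizes one from `rope_theta`/`rope_scaling` and writes it back onto the config. A condensed sketch of the v4 conversion path only, taking plain arguments instead of a PretrainedConfig:

def v4_to_rope_parameters(rope_theta=None, rope_scaling=None, original_max_pos=None):
    rope_parameters = dict(rope_scaling) if rope_scaling else None
    if rope_theta is not None:
        rope_parameters = rope_parameters or {"rope_type": "default"}
        rope_parameters["rope_theta"] = rope_theta
    if rope_parameters and original_max_pos:
        rope_parameters["original_max_position_embeddings"] = original_max_pos
    return rope_parameters

print(v4_to_rope_parameters(10000.0, {"rope_type": "yarn", "factor": 4.0}, 4096))
# {'rope_type': 'yarn', 'factor': 4.0, 'rope_theta': 10000.0,
#  'original_max_position_embeddings': 4096}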
diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py
index 9b634fd037a33..47fee9882f9fc 100644
--- a/vllm/transformers_utils/configs/afmoe.py
+++ b/vllm/transformers_utils/configs/afmoe.py
@@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig):
rms_norm_eps: float = 1e-5,
use_cache: bool = True,
tie_word_embeddings: bool = False,
- rope_theta: float = 10000.0,
+ rope_parameters: dict | None = None,
rope_scaling: dict | None = None,
num_experts: int = 64,
num_experts_per_tok: int = 6,
@@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
+ rope_theta = kwargs.pop("rope_theta", 10000.0)
+ if rope_parameters is None:
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+ self.rope_parameters = rope_parameters
self.rope_scaling = rope_scaling
self.moe_intermediate_size = moe_intermediate_size
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index 1707e15285c89..ba4b1a8f701f0 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
- rope_theta (`float`, *optional*, defaults to 1000000.0):
- The base period of the RoPE embeddings.
+ rope_parameters (`dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+ accordingly.
+ Expected contents:
+ `rope_theta` (`float`): The base period of the RoPE embeddings.
+ `rope_type` (`str`):
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+ 'llama3'], with 'default' being the original RoPE implementation.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `4096`.
attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig):
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
- rope_theta=1e6,
+ rope_parameters: dict[str, Any] | None = None,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=1,
@@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
+ rope_theta = kwargs.pop("rope_theta", 1e6)
+ if rope_parameters is None:
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+ self.rope_parameters = rope_parameters
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py
index 1f2f4d446288b..c343dc0999a87 100644
--- a/vllm/transformers_utils/configs/flex_olmo.py
+++ b/vllm/transformers_utils/configs/flex_olmo.py
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
from transformers.configuration_utils import PretrainedConfig
@@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig):
bos_token_id=None,
eos_token_id=100257,
tie_word_embeddings=False,
- rope_theta=500000.0,
- rope_scaling=None,
+ rope_parameters: dict[str, Any] | None = None,
attention_bias=False,
attention_dropout=0.0,
num_experts_per_tok=5,
@@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 500000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
@@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig):
self.norm_topk_prob = norm_topk_prob
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
- if self.rope_scaling is not None and "type" in self.rope_scaling:
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+ if self.rope_parameters is not None and "type" in self.rope_parameters:
+ self.rope_parameters["rope_type"] = self.rope_parameters["type"]
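FlexOlmo here, and KimiLinear, Nemotron, Olmo3, Qwen3Next, and Step3Text below, all use the same constructor idiom: accept checkpoints that still ship `rope_scaling`/`rope_theta` kwargs and normalize them into `rope_parameters` with a per-model default theta. A compact sketch of that idiom on a bare kwargs dict:

def normalize_rope_kwargs(rope_parameters=None, default_theta=500000.0, **kwargs):
    # A legacy rope_scaling dict, if present, wins over rope_parameters.
    rope_scaling = kwargs.pop("rope_scaling", None)
    rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
    rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta))
    return rope_parameters

print(normalize_rope_kwargs(rope_scaling={"rope_type": "linear", "factor": 2.0}))
# {'rope_type': 'linear', 'factor': 2.0, 'rope_theta': 500000.0}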
diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py
index 65ddf48c5249b..14894816801d1 100644
--- a/vllm/transformers_utils/configs/kimi_linear.py
+++ b/vllm/transformers_utils/configs/kimi_linear.py
@@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig):
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
- rope_theta=10000.0,
- rope_scaling=None,
+ rope_parameters=None,
tie_word_embeddings=False,
moe_intermediate_size: int | None = None,
moe_renormalize: bool = True,
@@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 10000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py
index 37c038e12db80..b399a03c030f0 100644
--- a/vllm/transformers_utils/configs/lfm2_moe.py
+++ b/vllm/transformers_utils/configs/lfm2_moe.py
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
from transformers.configuration_utils import PretrainedConfig
@@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
- rope_theta (`float`, *optional*, defaults to 1000000.0):
- The base period of the RoPE embeddings.
+ rope_parameters (`dict`, *optional*):
+ The parameters of the RoPE embeddings.
max_position_embeddings (`int`, *optional*, defaults to 128000):
The maximum sequence length that this model might ever be used with.
use_cache (`bool`, *optional*, defaults to `True`):
@@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig):
bos_token_id: int = 1,
eos_token_id: int = 2,
tie_word_embeddings: bool = True,
- rope_theta: float = 1000000.0,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 128_000,
use_cache: bool = True,
norm_eps: float = 0.00001,
@@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
- self.rope_theta = rope_theta
+ rope_theta = kwargs.pop("rope_theta", 1000000.0)
+ if rope_parameters is None:
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+ self.rope_parameters = rope_parameters
self.max_position_embeddings = max_position_embeddings
self.use_cache = use_cache
self.norm_eps = norm_eps
diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py
index e49bd26b2b00c..f1bbd057103e4 100644
--- a/vllm/transformers_utils/configs/midashenglm.py
+++ b/vllm/transformers_utils/configs/midashenglm.py
@@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig):
if text_config
else Qwen2_5OmniTextConfig()
)
- self.text_config.rope_scaling = None # uses_mrope is false
+ self.text_config.rope_parameters = None # uses_mrope is false
self.audio_token_id = audio_token_id
super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index c6f04febe37e1..8f72f0b28b0de 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
- config["rope_scaling"] = {
+ config["rope_parameters"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
- config["rope_scaling"][new_name] = yarn_config.pop(old_name)
+ config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
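For Mistral-format checkpoints the YaRN block is remapped directly into `rope_parameters`. A toy version of that remap; the key map here only contains the `apply_scale` entry visible above, the rest of the real map is elided:

YARN_KEY_MAP = {"apply_scale": "apply_yarn_scaling"}  # partial, for illustration

def remap_yarn(config: dict) -> dict:
    yarn = dict(config.get("yarn") or {})
    config["rope_parameters"] = {"rope_type": "yarn", "mscale_all_dim": 1}
    for old_name, new_name in YARN_KEY_MAP.items():
        if old_name in yarn:
            config["rope_parameters"][new_name] = yarn.pop(old_name)
    assert len(yarn) == 0, f"Unparsed yarn config: {yarn}"
    return config

print(remap_yarn({"yarn": {"apply_scale": True}})["rope_parameters"])
# {'rope_type': 'yarn', 'mscale_all_dim': 1, 'apply_yarn_scaling': True}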
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 60eed549561fb..d112c71d7d20b 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
- rope_theta (`float`, *optional*, defaults to 10000.0):
- The base period of the RoPE embeddings.
+ rope_parameters (`dict`, *optional*):
+ The parameters of the RoPE embeddings.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
@@ -132,8 +132,7 @@ class NemotronConfig(PretrainedConfig):
bos_token_id=2,
eos_token_id=3,
tie_word_embeddings=False,
- rope_theta=10000.0,
- rope_scaling=None,
+ rope_parameters=None,
partial_rotary_factor=0.5,
attention_bias=False,
attention_dropout=0.0,
@@ -160,8 +159,13 @@ class NemotronConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.norm_eps = norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 10000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
# for backward compatibility
partial_rotary_factor = (
kwargs.get("rope_percent")
@@ -169,7 +173,7 @@ class NemotronConfig(PretrainedConfig):
or partial_rotary_factor
)
self.partial_rotary_factor = partial_rotary_factor
- self._rope_scaling_validation()
+ self._rope_parameters_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
@@ -182,31 +186,29 @@ class NemotronConfig(PretrainedConfig):
**kwargs,
)
- def _rope_scaling_validation(self):
+ def _rope_parameters_validation(self):
"""
- Validate the `rope_scaling` configuration.
+ Validate the `rope_parameters` configuration.
"""
- if self.rope_scaling is None:
+ if self.rope_parameters is None:
return
- if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ rope_type: str | None = self.rope_parameters.get("rope_type", None)
+ factor: float | None = self.rope_parameters.get("factor", None)
+
+ if rope_type not in {"default", "linear", "dynamic"}:
raise ValueError(
- "`rope_scaling` must be a dictionary with two fields, "
- f"`type` and `factor`, got {self.rope_scaling}"
- )
- rope_scaling_type = self.rope_scaling.get("type", None)
- rope_scaling_factor = self.rope_scaling.get("factor", None)
- if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
- raise ValueError(
- "`rope_scaling`'s type field must be one of ['linear', "
- f"'dynamic'], got {rope_scaling_type}"
- )
- if (
- rope_scaling_factor is None
- or not isinstance(rope_scaling_factor, float)
- or rope_scaling_factor <= 1.0
- ):
- raise ValueError(
- "`rope_scaling`'s factor field must be a float > 1, got "
- f"{rope_scaling_factor}"
+ "`rope_type` must be one of ['default', 'linear', 'dynamic'], "
+ f"got {rope_type}"
)
+ if rope_type != "default":
+ if factor is None:
+ raise ValueError(
+ "If `rope_type` is not 'default', `rope_parameters` "
+ "must include a `factor` field. Got `None`."
+ )
+ if not isinstance(factor, float) or factor <= 1.0:
+ raise ValueError(
+ "`rope_parameters`'s factor field must be a float > 1, got "
+ f"{factor}"
+ )
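Nemotron's validation is rewritten against the new dict: `rope_type` must be one of the supported variants, and any non-default type must carry a `factor` greater than 1. The same rules expressed as a standalone function:

def validate_rope_parameters(rope_parameters: dict | None) -> None:
    if rope_parameters is None:
        return
    rope_type = rope_parameters.get("rope_type")
    factor = rope_parameters.get("factor")
    if rope_type not in {"default", "linear", "dynamic"}:
        raise ValueError(f"`rope_type` must be default/linear/dynamic, got {rope_type}")
    if rope_type != "default" and (not isinstance(factor, float) or factor <= 1.0):
        raise ValueError(f"`factor` must be a float > 1, got {factor}")

validate_rope_parameters({"rope_type": "linear", "factor": 2.0})  # passes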
diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py
index f5a9a7cd36bdb..c4691b661af39 100644
--- a/vllm/transformers_utils/configs/olmo3.py
+++ b/vllm/transformers_utils/configs/olmo3.py
@@ -24,8 +24,7 @@ class Olmo3Config(PretrainedConfig):
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
- rope_theta=10000.0,
- rope_scaling=None,
+ rope_parameters=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
@@ -63,8 +62,13 @@ class Olmo3Config(PretrainedConfig):
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 10000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py
index 21750bde2f878..d2fe58d48da6f 100644
--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig):
relevant if `config.is_decoder=True`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
- rope_theta (`float`, *optional*, defaults to 10000.0):
- The base period of the RoPE embeddings.
- rope_scaling (`Dict`, *optional*):
+ rope_parameters (`dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
+ `rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
@@ -199,8 +198,7 @@ class Qwen3NextConfig(PretrainedConfig):
rms_norm_eps=1e-6,
use_cache=True,
tie_word_embeddings=False,
- rope_theta=10000.0,
- rope_scaling=None,
+ rope_parameters=None,
partial_rotary_factor=0.25,
attention_bias=False,
attention_dropout=0.0,
@@ -236,8 +234,13 @@ class Qwen3NextConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 10000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.partial_rotary_factor = partial_rotary_factor
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py
index 637b82d88e265..0ee650a70451f 100644
--- a/vllm/transformers_utils/configs/step3_vl.py
+++ b/vllm/transformers_utils/configs/step3_vl.py
@@ -52,8 +52,7 @@ class Step3TextConfig(PretrainedConfig):
moe_intermediate_size: int = 5120,
moe_num_experts: int = 48,
moe_top_k: int = 3,
- rope_theta: float = 500000,
- rope_scaling: dict[str, Any] | None = None,
+ rope_parameters: dict[str, Any] | None = None,
max_position_embedding: int = 65536,
share_expert_dim: int = 5120,
share_q_dim: int = 2048,
@@ -130,8 +129,13 @@ class Step3TextConfig(PretrainedConfig):
self.moe_intermediate_size = moe_intermediate_size
self.moe_num_experts = moe_num_experts
self.moe_top_k = moe_top_k
- self.rope_theta = rope_theta
- self.rope_scaling = rope_scaling
+        # Prefer legacy `rope_scaling` if given, else use `rope_parameters`
+ rope_scaling = kwargs.pop("rope_scaling", None)
+ rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+ rope_theta = kwargs.pop("rope_theta", 500000.0)
+ if "rope_theta" not in rope_parameters:
+ rope_parameters["rope_theta"] = rope_theta
+ self.rope_parameters = rope_parameters
self.max_position_embedding = max_position_embedding
self.share_expert_dim = share_expert_dim
self.share_q_dim = share_q_dim
From 0c80efd94fb8c17cfc7d1bcb9cdb65f154340994 Mon Sep 17 00:00:00 2001
From: Yuxuan Zhang <2448370773@qq.com>
Date: Thu, 20 Nov 2025 01:32:55 +0800
Subject: [PATCH 024/249] GLM-V video segmentation solution adjustment (#28941)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
---
vllm/model_executor/models/glm4_1v.py | 94 +++++++++++++++++++++++++--
1 file changed, 90 insertions(+), 4 deletions(-)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 6581bbda6d609..d141e95498064 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -37,7 +37,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
-from transformers import BatchFeature
+from transformers import BatchFeature, Glm4vProcessor
from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
from transformers.models.glm4v.image_processing_glm4v import (
Glm4vImageProcessor,
@@ -1028,7 +1028,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
return max(max_frames_per_video, 1)
- def _get_video_second_idx(
+ def _get_video_second_idx_glm4v(
self, metadata: dict[str, Any], total_frames: int
) -> list[int]:
video_processor = self.get_video_processor()
@@ -1079,6 +1079,83 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
selected_timestamps.append(timestamps_list[idx])
return selected_timestamps
+ def _get_video_second_idx_glm46v(
+ self, metadata: dict[str, Any], total_frames: int
+ ) -> list[int]:
+ video_processor = self.get_video_processor()
+
+ video_fps = metadata["fps"]
+ meta_frames = metadata.get("total_num_frames", total_frames)
+ max_frame_idx = meta_frames - 1
+ duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1)
+
+ do_sample_frames = metadata.get("do_sample_frames", True)
+ if not do_sample_frames:
+ frame_indices = metadata["frames_indices"]
+ else:
+ DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5}
+ MAX_FRAME_COUNT_DYNAMIC = 640
+ MAX_DURATION = 2400
+
+ effective_duration = min(duration, MAX_DURATION)
+ if effective_duration <= 30:
+ target_fps = DYNAMIC_FPS_THRES[30]
+ elif effective_duration <= 300:
+ target_fps = DYNAMIC_FPS_THRES[300]
+ else:
+ target_fps = DYNAMIC_FPS_THRES[2400]
+
+ temporal_patch_size = getattr(video_processor, "temporal_patch_size", 1)
+ extract_t = int(effective_duration * target_fps * temporal_patch_size)
+ extract_t = min(extract_t, MAX_FRAME_COUNT_DYNAMIC)
+
+ duration_per_frame = 1 / video_fps
+ timestamps = [i * duration_per_frame for i in range(meta_frames)]
+ max_second = int(duration)
+
+ if meta_frames < extract_t:
+ frame_indices = np.linspace(
+ 0, meta_frames - 1, extract_t, dtype=int
+ ).tolist()
+ else:
+ frame_indices = []
+ current_second = 0.0
+ inv_fps = 1 / (temporal_patch_size * target_fps)
+ for frame_index in range(meta_frames):
+ if timestamps[frame_index] >= current_second:
+ current_second += inv_fps
+ frame_indices.append(frame_index)
+ if current_second >= max_second:
+ break
+
+ if len(frame_indices) < extract_t:
+ if len(frame_indices) == 0:
+ start, end = 0, max(meta_frames - 1, 0)
+ else:
+ start, end = frame_indices[0], frame_indices[-1]
+ frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist()
+ elif len(frame_indices) > extract_t:
+ frame_indices = np.linspace(
+ 0, meta_frames - 1, extract_t, dtype=int
+ ).tolist()
+
+ seen, uniq = set(), []
+ for idx in frame_indices:
+ if idx not in seen:
+ seen.add(idx)
+ uniq.append(idx)
+
+ if len(uniq) & 1:
+ uniq.append(uniq[-1])
+
+ frame_indices = uniq
+ full_second_idxs = [int(idx / video_fps) for idx in frame_indices]
+ timestamps_list = full_second_idxs[::2]
+ selected_timestamps = []
+ for idx in range(len(timestamps_list)):
+ selected_timestamps.append(timestamps_list[idx])
+ return selected_timestamps
+
def _construct_video_placeholder(
self,
video_array: np.ndarray,
@@ -1097,9 +1174,18 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
merge_length = image_processor.merge_size**2
assert isinstance(grid_thw, torch.Tensor)
- timestamps = self._get_video_second_idx(metadata, len(video_array))
+ timestamps = (
+ self._get_video_second_idx_glm4v(metadata, len(video_array))
+ if isinstance(hf_processor, Glm4vProcessor)
+ else self._get_video_second_idx_glm46v(metadata, len(video_array))
+ )
+
+ timestamp_format = (
+ "{}" if isinstance(hf_processor, Glm4vProcessor) else "{:.1f} seconds"
+ )
frames_idx_token = [
- tokenizer.encode(str(i), add_special_tokens=False) for i in timestamps
+ tokenizer.encode(timestamp_format.format(i), add_special_tokens=False)
+ for i in timestamps
]
T, H, W = grid_thw
num_tokens_per_frame = int(H * W) // merge_length
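The new GLM-4.6V path budgets frames from the clip duration: a dynamic target fps (3 fps up to 30 s, 1 fps up to 300 s, 0.5 fps beyond), multiplied by the temporal patch size and capped at 640 frames. A condensed sketch of just that budget computation; `temporal_patch_size=2` is an assumed default:

DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5}  # duration cap (s) -> target fps
MAX_FRAME_COUNT_DYNAMIC = 640
MAX_DURATION = 2400

def frames_to_extract(duration: float, temporal_patch_size: int = 2) -> int:
    effective = min(duration, MAX_DURATION)
    if effective <= 30:
        target_fps = DYNAMIC_FPS_THRES[30]
    elif effective <= 300:
        target_fps = DYNAMIC_FPS_THRES[300]
    else:
        target_fps = DYNAMIC_FPS_THRES[2400]
    return min(int(effective * target_fps * temporal_patch_size), MAX_FRAME_COUNT_DYNAMIC)

print(frames_to_extract(12))    # 72
print(frames_to_extract(3600))  # 640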
From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:32:19 -0500
Subject: [PATCH 025/249] Re-enable FlashInfer for Llama4 on Blackwell in e2e
fusion tests (#28966)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Luka Govedič
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič
---
.buildkite/test-pipeline.yaml | 2 ++
tests/compile/distributed/test_fusions_e2e.py | 12 ++++--------
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d4b6f4077ab32..98daebcc06931 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -930,6 +930,8 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 2e1b595a43895..661172e1965b5 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
ModelBackendTestCase(
# Use smaller model for L40s in CI
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
- # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
- # so FI attention+fp8_quant is at least tested once
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
- backend=AttentionBackendEnum.FLASHINFER
- if is_blackwell()
- else AttentionBackendEnum.TRITON_ATTN,
+ backend=AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=32,
allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
# TODO FlashInfer attn broken on Hopper with kvcache=fp8:
# https://github.com/vllm-project/vllm/issues/28568
- # TODO FlashInfer attn broken on Blackwell for llama4:
- # https://github.com/vllm-project/vllm/issues/28604
- backend=AttentionBackendEnum.TRITON_ATTN,
+ backend=AttentionBackendEnum.FLASHINFER
+ if is_blackwell()
+ else AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=48,
allreduce_fusion=96,
From 3319a493fcc3e4733382f0dc812184234e9c3dcb Mon Sep 17 00:00:00 2001
From: Jialin Ouyang
Date: Wed, 19 Nov 2025 11:20:22 -0800
Subject: [PATCH 026/249] [Core] Reuse created spec tokens lists to mitigate GC
cost (#28917)
Signed-off-by: Jialin Ouyang
---
vllm/v1/worker/gpu_input_batch.py | 18 ++++++++++++------
vllm/v1/worker/gpu_model_runner.py | 3 ++-
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 023b5edb2c340..c1bfe727d86e5 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -251,7 +251,7 @@ class InputBatch:
self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids
# Store last speculative tokens for sampler.
- self.spec_token_ids: list[list[int] | None] = []
+ self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)]
# This is updated each time the batch constituents change.
self.sampling_metadata = self._make_sampling_metadata()
@@ -313,7 +313,7 @@ class InputBatch:
else:
self._req_ids[req_index] = req_id
self.req_output_token_ids[req_index] = request.output_token_ids
- self.spec_token_ids[req_index] = []
+ self.spec_token_ids[req_index].clear()
self.req_id_to_index[req_id] = req_index
@@ -462,7 +462,7 @@ class InputBatch:
self.batch_update_builder.removed_append(req_index)
self._req_ids[req_index] = None
self.req_output_token_ids[req_index] = None
- self.spec_token_ids[req_index] = None
+ self.spec_token_ids[req_index].clear()
# LoRA
lora_id = self.request_lora_mapping[req_index]
@@ -654,9 +654,15 @@ class InputBatch:
self.req_output_token_ids[last_req_index] = None
self.req_id_to_index[req_id] = empty_index
- spec_token_ids = self.spec_token_ids[last_req_index]
- self.spec_token_ids[empty_index] = spec_token_ids
- self.spec_token_ids[last_req_index] = None
+ if last_req_index != empty_index:
+ (
+ self.spec_token_ids[last_req_index],
+ self.spec_token_ids[empty_index],
+ ) = (
+ self.spec_token_ids[empty_index],
+ self.spec_token_ids[last_req_index],
+ )
+ self.spec_token_ids[last_req_index].clear()
num_tokens = self.num_tokens[last_req_index]
self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3b00085b6bb99..0c35f1330e9f0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -892,7 +892,8 @@ class GPUModelRunner(
# conform to the schema. This can result in
# scheduler_output.scheduled_spec_decode_tokens being empty,
# even when speculative decoding is enabled.
- self.input_batch.spec_token_ids[req_index] = spec_token_ids
+ self.input_batch.spec_token_ids[req_index].clear()
+ self.input_batch.spec_token_ids[req_index].extend(spec_token_ids)
# there are no draft tokens with async scheduling,
# we clear the spec_decoding info in scheduler_output and
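The core of this change is allocation avoidance: the per-slot spec-token lists are created once and thereafter only mutated (clear/extend) or swapped between slots, so steady-state decoding creates no new list objects for the garbage collector to track. A small sketch of the same idea outside the InputBatch class:

# Mutate preallocated lists in place instead of rebinding them, so the
# allocator and GC see far fewer short-lived objects.
max_num_reqs = 4
spec_token_ids = [[] for _ in range(max_num_reqs)]

def set_spec_tokens(req_index: int, tokens: list[int]) -> None:
    buf = spec_token_ids[req_index]
    buf.clear()
    buf.extend(tokens)

def swap_slots(a: int, b: int) -> None:
    # Swap the list objects themselves; no new lists are created.
    spec_token_ids[a], spec_token_ids[b] = spec_token_ids[b], spec_token_ids[a]
    spec_token_ids[a].clear()

set_spec_tokens(0, [7, 8, 9])
swap_slots(0, 2)
print(spec_token_ids)  # [[], [], [7, 8, 9], []]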
From fe69f331f84d99541564dfe4852dd45220ed7875 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Wed, 19 Nov 2025 14:23:54 -0500
Subject: [PATCH 027/249] [Kernels] Improve H200 Fused MoE Config (#28992)
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
---
...,dtype=fp8_w8a8,block_shape=[128,128].json | 122 +++++++++---------
1 file changed, 61 insertions(+), 61 deletions(-)
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
index 6fcf408755f5d..532c16e899269 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -1,11 +1,11 @@
{
"1": {
"BLOCK_SIZE_M": 16,
- "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 1,
+ "GROUP_SIZE_M": 16,
"num_warps": 4,
- "num_stages": 5
+ "num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
@@ -13,82 +13,82 @@
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
- "num_stages": 3
+ "num_stages": 4
},
"4": {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
- "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
- "16": {
- "BLOCK_SIZE_M": 16,
- "BLOCK_SIZE_N": 256,
- "BLOCK_SIZE_K": 64,
- "GROUP_SIZE_M": 32,
- "num_warps": 4,
- "num_stages": 3
- },
- "24": {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 128,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 16,
- "num_warps": 4,
- "num_stages": 3
- },
- "32": {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 128,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 1,
- "num_warps": 4,
- "num_stages": 3
- },
- "48": {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 128,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 32,
- "num_warps": 4,
- "num_stages": 3
- },
"64": {
- "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 32,
+ "GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"96": {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": 128,
- "BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 32,
- "num_warps": 4,
- "num_stages": 3
- },
- "128": {
- "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
"256": {
- "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
@@ -96,10 +96,10 @@
"num_stages": 3
},
"512": {
- "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 32,
+ "GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
@@ -109,7 +109,7 @@
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
- "num_stages": 3
+ "num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
@@ -117,21 +117,21 @@
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
- "num_stages": 3
+ "num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 64,
+ "GROUP_SIZE_M": 32,
"num_warps": 4,
- "num_stages": 3
+ "num_stages": 4
},
"3072": {
- "BLOCK_SIZE_M": 128,
- "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 16,
+ "GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
@@ -139,7 +139,7 @@
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
- "GROUP_SIZE_M": 64,
+ "GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
From 9d2d5612573c20f8bf00242a8525c2a5dcfe4c06 Mon Sep 17 00:00:00 2001
From: 杰兮 <38908462+zhyajie@users.noreply.github.com>
Date: Thu, 20 Nov 2025 03:30:57 +0800
Subject: [PATCH 028/249] [Bugfix] Fix precision corruption when
shared_experts_stream=None (#28942)
Signed-off-by: zhyajie
Co-authored-by: zhyajie
---
vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++----
vllm/utils/torch_utils.py | 3 +--
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index c41995e4a9136..8e9bba3442873 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -371,8 +371,8 @@ class FusedMoE(CustomOp):
logger.info_once("Disabling MoE shared_experts cuda stream")
self.shared_experts_stream = None
else:
- # TODO(rob): enable shared expert overlap with non-cuda.
- # aux_stream() returns None on non-cuda platforms.
+ # TODO(rob): enable shared expert overlap with non-cuda-alike.
+ # aux_stream() returns None on non-cuda-alike platforms.
self.shared_experts_stream = aux_stream()
if self.shared_experts_stream is not None:
logger.info_once("Enabled separate cuda stream for MoE shared_experts")
@@ -1865,6 +1865,11 @@ class FusedMoE(CustomOp):
hidden_states_combined, router_logits = get_ep_group().dispatch(
hidden_states, router_logits, self.is_sequence_parallel
)
+            # Run the shared experts before the matrix multiply,
+            # because the matrix multiply may modify hidden_states.
+ if has_separate_shared_experts and not use_shared_experts_stream:
+ assert self.shared_experts is not None
+ shared_output = self.shared_experts(hidden_states)
# Matrix multiply.
final_hidden_states = self.quant_method.apply(
@@ -1908,8 +1913,6 @@ class FusedMoE(CustomOp):
# conflict with the main stream
shared_output = self.shared_experts(hidden_states_clone)
current_stream().wait_stream(self.shared_experts_stream)
- else:
- shared_output = self.shared_experts(hidden_states)
final_hidden_states = (
shared_output,
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index 7c094e14cff72..3661dfd09047a 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -426,8 +426,7 @@ def aux_stream() -> torch.cuda.Stream | None:
from vllm.platforms import current_platform
- # TODO: validate this works properly on ROCm platform.
- if _aux_stream is None and current_platform.is_cuda():
+ if _aux_stream is None and current_platform.is_cuda_alike():
_aux_stream = torch.cuda.Stream()
return _aux_stream
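The bug fixed here is an ordering hazard: when no separate stream (and hence no clone of hidden_states) is used, the shared experts must read hidden_states before the routed-expert matmul, which may modify it. A CPU-only sketch of the hazard with stand-in functions (not the real kernels):

    # Why ordering matters when the expert matmul mutates its input
    # (hypothetical shapes and functions; torch CPU only).
    import torch

    def routed_experts(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for quant_method.apply(...), which may modify x in place.
        x.mul_(0.0)
        return x

    def shared_experts(x: torch.Tensor) -> torch.Tensor:
        return x * 2.0

    hidden_states = torch.ones(4, 8)

    # Correct: read hidden_states for the shared experts before it can be mutated.
    shared_out = shared_experts(hidden_states)
    routed_out = routed_experts(hidden_states)
    assert torch.allclose(shared_out, torch.full((4, 8), 2.0))
    # Computing shared_out after routed_experts would have used the zeroed tensor.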
From ac10fd3c6900228e3c0a8fae20d039668c132446 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Wed, 19 Nov 2025 11:59:30 -0800
Subject: [PATCH 029/249] Upstreaming aiter triton attention backend as a new
backend (#28701)
Signed-off-by: Aleksandr Malyshev
Co-authored-by: Aleksandr Malyshev
---
vllm/attention/backends/registry.py | 3 +
vllm/platforms/rocm.py | 4 +-
.../backends/mla/aiter_triton_mla.py | 74 +++++++++++++++++++
3 files changed, 80 insertions(+), 1 deletion(-)
create mode 100644 vllm/v1/attention/backends/mla/aiter_triton_mla.py
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index 51899b0235915..91e1cad01f4fd 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -46,6 +46,9 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
+ ROCM_AITER_TRITON_MLA = (
+ "vllm.v1.attention.backends.mla.aiter_triton_mla.AiterTritonMLABackend"
+ )
ROCM_AITER_FA = (
"vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
)
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index bb116792fed54..f07f068a9249b 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -234,7 +234,6 @@ class RocmPlatform(Platform):
if rocm_aiter_ops.is_mla_enabled() or block_size == 1
else AttentionBackendEnum.TRITON_MLA
)
-
if selected_backend == AttentionBackendEnum.TRITON_MLA:
if block_size != 1:
logger.info_once("Using Triton MLA backend.")
@@ -246,6 +245,9 @@ class RocmPlatform(Platform):
if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA:
logger.info("Using AITER MLA backend.")
return AttentionBackendEnum.ROCM_AITER_MLA.get_path()
+ if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA:
+ logger.info("Using AITER TRITON MLA backend.")
+ return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path()
raise ValueError(
f" The selected backend, {selected_backend.name},"
diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
new file mode 100644
index 0000000000000..8a92152a0ca53
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.attention.backends.mla.common import MLACommonBackend
+from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
+ AiterMLAImpl,
+ AiterMLAMetadataBuilder,
+)
+
+
+class AiterTritonMLABackend(MLACommonBackend):
+ @staticmethod
+ def get_name() -> str:
+ return "AITER_TRITON_MLA"
+
+ @staticmethod
+ def get_impl_cls() -> type["AiterTritonMLAImpl"]:
+ return AiterTritonMLAImpl
+
+ @staticmethod
+ def get_builder_cls() -> type["AiterMLAMetadataBuilder"]:
+ return AiterMLAMetadataBuilder
+
+
+class AiterTritonMLAImpl(AiterMLAImpl):
+ def __init__(
+ self,
+ num_heads: int,
+ head_size: int,
+ scale: float,
+ num_kv_heads: int,
+ alibi_slopes: list[float] | None,
+ sliding_window: int | None,
+ kv_cache_dtype: str,
+ logits_soft_cap: float | None,
+ attn_type: str,
+ kv_sharing_target_layer_name: str | None,
+ # MLA Specific Arguments
+ **mla_args,
+ ) -> None:
+ super().__init__(
+ num_heads,
+ head_size,
+ scale,
+ num_kv_heads,
+ alibi_slopes,
+ sliding_window,
+ kv_cache_dtype,
+ logits_soft_cap,
+ attn_type,
+ kv_sharing_target_layer_name,
+ **mla_args,
+ )
+ from aiter.ops.triton.mha import flash_attn_varlen_func
+
+ self.flash_attn_varlen_func = flash_attn_varlen_func
+
+ def _flash_attn_varlen_diff_headdims(
+ self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
+ ):
+ result = self.flash_attn_varlen_func(
+ q,
+ k,
+ v,
+ softmax_scale=softmax_scale,
+ return_lse=return_softmax_lse,
+ **kwargs,
+ )
+ # Transpose the LSE if Triton MHA is used:
+ # (q.shape[0], num_q_heads) to (num_q_heads, q.shape[0])
+ if type(result) is tuple and return_softmax_lse:
+ output, lse = result
+ lse = lse.T.contiguous()
+ return (output, lse)
+ return result
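The only behavioral difference from the inherited AiterMLAImpl is the LSE layout returned by the Triton kernel; backend selection itself goes through the registry entry and the rocm.py branch added earlier in this patch. A small sketch of that normalization, assuming the shapes given in the comments above:

    # Minimal sketch of the LSE layout normalization (illustrative only).
    import torch

    def normalize_lse(out: torch.Tensor, lse: torch.Tensor):
        # Triton MHA returns LSE as (num_tokens, num_q_heads); the MLA code path
        # expects (num_q_heads, num_tokens), so transpose and make it contiguous.
        return out, lse.T.contiguous()

    out = torch.randn(16, 8, 64)   # [num_tokens, num_q_heads, head_dim]
    lse = torch.randn(16, 8)       # [num_tokens, num_q_heads]
    out, lse = normalize_lse(out, lse)
    assert lse.shape == (8, 16) and lse.is_contiguous()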
From 02f5903b84cfdf0b7cb31d46e995e3d4b9ad9e53 Mon Sep 17 00:00:00 2001
From: Izzy Putterman
Date: Wed, 19 Nov 2025 12:01:05 -0800
Subject: [PATCH 030/249] Eagle: MM Cuda Graphs with MRope (#28896)
Signed-off-by: Izzy Putterman
Co-authored-by: Cyrus Leung
---
vllm/model_executor/models/llama_eagle3.py | 14 ++++++--------
vllm/v1/spec_decode/eagle.py | 13 +++++++++++--
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 75c671311b491..3eaf2d80082f1 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -23,7 +23,6 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM
-from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from .utils import (
@@ -121,13 +120,12 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
@support_torch_compile(
- # torch.compile is disabled for multimodal EAGLE3 models due to constraint
- # violations with dynamic shapes during tensor concatenation operations.
- # See: https://github.com/vllm-project/vllm/pull/22872/files#r2362028132
- # Non-multimodal EAGLE3 models can still use torch.compile safely.
- enable_if=lambda vllm_config: not MULTIMODAL_REGISTRY.supports_multimodal_inputs(
- vllm_config.model_config
- ),
+ dynamic_arg_dims={
+ "input_ids": 0,
+ "positions": -1,
+ "hidden_states": 0,
+ "input_embeds": 0,
+ }
)
class LlamaModel(nn.Module):
def __init__(
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 5bf2503c3027d..406bb696bd4cf 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -116,9 +116,18 @@ class EagleProposer:
)
self.uses_mrope = self.vllm_config.model_config.uses_mrope
if self.uses_mrope:
- # M-RoPE need (3, max_num_tokens)
+ # NOTE: `mrope_positions` is implemented with one additional dummy
+ # position on purpose to make it non-contiguous so that it can work
+ # with torch compile.
+ # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
+
+ # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
+ # the modality of inputs. For text-only inputs, each dimension has
+ # identical position IDs, making M-RoPE functionally equivalent to
+ # 1D-RoPE.
+ # See page 5 of https://arxiv.org/abs/2409.12191
self.mrope_positions = torch.zeros(
- (3, self.max_num_tokens), dtype=torch.int64, device=device
+ (3, self.max_num_tokens + 1), dtype=torch.int64, device=device
)
else:
# RoPE need (max_num_tokens,)
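For reference, a minimal sketch of the over-allocation trick the NOTE above describes: one extra dummy column makes the usable slice non-contiguous, which is what the linked discussion relies on for torch.compile with dynamic shapes (illustrative sizes only, no torch.compile involved here):

    import torch

    max_num_tokens = 8
    # Allocate one extra column so that slicing to max_num_tokens yields a
    # non-contiguous view.
    mrope_positions = torch.zeros(3, max_num_tokens + 1, dtype=torch.int64)

    positions_view = mrope_positions[:, :max_num_tokens]
    assert positions_view.shape == (3, max_num_tokens)
    assert not positions_view.is_contiguous()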
From 2fd893b4cec0975a2a8430077fd9b4f294eb3561 Mon Sep 17 00:00:00 2001
From: Qiu
Date: Thu, 20 Nov 2025 04:52:44 +0800
Subject: [PATCH 031/249] [Feature] Prefill Context Parallel (PCP) basic
support (#28718)
Signed-off-by: QiuChunshuo
Signed-off-by: FENP
Signed-off-by: LookAround
Signed-off-by: Jingchun Gao
Signed-off-by: zhenwenqi2024
Co-authored-by: FENP
Co-authored-by: LookAround
Co-authored-by: Jingchun Gao
Co-authored-by: zhenwenqi2024
Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
---
tests/distributed/test_context_parallel.py | 12 +--
.../moe/modular_kernel_tools/common.py | 7 +-
tests/v1/worker/test_gpu_model_runner.py | 4 +-
vllm/attention/backends/abstract.py | 17 +++++
vllm/attention/ops/common.py | 40 +++++++++-
vllm/config/parallel.py | 40 +++++++---
vllm/config/vllm.py | 32 ++++++--
vllm/distributed/parallel_state.py | 74 +++++++++++++++----
vllm/engine/arg_utils.py | 22 ++++++
.../model_executor/layers/fused_moe/config.py | 59 ++++++++++-----
vllm/model_executor/layers/fused_moe/layer.py | 32 ++++++++
vllm/model_executor/models/gpt_oss.py | 9 ++-
vllm/v1/attention/backends/flash_attn.py | 6 +-
vllm/v1/attention/backends/mla/common.py | 6 +-
vllm/v1/attention/backends/utils.py | 18 ++---
vllm/v1/core/kv_cache_coordinator.py | 17 +++++
vllm/v1/core/kv_cache_manager.py | 9 +--
vllm/v1/core/kv_cache_utils.py | 13 +++-
vllm/v1/core/sched/scheduler.py | 2 +
vllm/v1/core/single_type_kv_cache_manager.py | 19 ++++-
vllm/v1/engine/core.py | 1 +
vllm/v1/executor/multiproc_executor.py | 23 ++++--
vllm/v1/kv_cache_interface.py | 5 +-
vllm/v1/worker/block_table.py | 35 +++++----
vllm/v1/worker/gpu_input_batch.py | 4 +-
vllm/v1/worker/gpu_model_runner.py | 4 +-
vllm/v1/worker/gpu_worker.py | 3 +
27 files changed, 399 insertions(+), 114 deletions(-)
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index b16fd0d06b145..7e4713b8aece0 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -31,7 +31,7 @@ class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
dcp_size: int
- dcp_kv_cache_interleave_size: int
+ cp_kv_cache_interleave_size: int
eager_mode: bool
chunked_prefill: bool
@@ -55,7 +55,7 @@ class CPTestSettings:
tp_base: int = 4,
pp_base: int = 1,
dcp_base: int = 1,
- dcp_kv_cache_interleave_size: int = 1,
+ cp_kv_cache_interleave_size: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
@@ -71,7 +71,7 @@ class CPTestSettings:
tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=int(dcp_multiplier * tp_base),
- dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val,
)
@@ -116,7 +116,7 @@ def _compare_cp_with_tp(
tp_size,
pp_size,
dcp_size,
- dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size,
eager_mode,
chunked_prefill,
) = parallel_setup
@@ -197,7 +197,7 @@ def _compare_cp_with_tp(
"--decode-context-parallel-size",
str(dcp_size),
"--dcp-kv-cache-interleave-size",
- str(dcp_kv_cache_interleave_size),
+ str(cp_kv_cache_interleave_size),
"--distributed-executor-backend",
distributed_backend,
]
@@ -227,7 +227,7 @@ CP_TEXT_GENERATION_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2),
- CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
+ CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
],
"bigcode/gpt_bigcode-santacoder": [
CPTestSettings.detailed(),
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index 1d925dc1bea8f..d95c22fdf0a5b 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -15,7 +15,11 @@ from tests.kernels.quantization.nvfp4_utils import (
)
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import (
+ get_dp_group,
+ get_pcp_group,
+ get_tensor_model_parallel_world_size,
+)
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
@@ -561,6 +565,7 @@ def make_modular_kernel(
# make moe config
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=get_tensor_model_parallel_world_size(),
+ pcp_size_=get_pcp_group().world_size,
dp_size_=get_dp_group().world_size,
vllm_parallel_config=vllm_config.parallel_config,
)
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index b95c8df3469b3..824e458978350 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -956,7 +956,7 @@ def test_hybrid_block_table_initialization():
max_num_reqs = 10
max_num_blocks_per_req = 20
max_num_batched_tokens = 512
- dcp_kv_cache_interleave_size = 8
+ cp_kv_cache_interleave_size = 8
block_table = BlockTable(
block_size=block_size,
@@ -966,7 +966,7 @@ def test_hybrid_block_table_initialization():
pin_memory=False,
device=torch.device(DEVICE),
kernel_block_size=kernel_block_sizes[0],
- dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
)
# Verify hybrid block configuration
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 9275d70fd86a4..d28bc065852db 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -266,6 +266,12 @@ class AttentionImpl(ABC, Generic[T]):
dcp_world_size: int
dcp_rank: int
+ pcp_world_size: int
+ pcp_rank: int
+
+ total_cp_world_size: int
+ total_cp_rank: int
+
def __new__(cls, *args, **kwargs):
# use __new__ so that all subclasses will call this
self = super().__new__(cls)
@@ -278,6 +284,17 @@ class AttentionImpl(ABC, Generic[T]):
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
+ try:
+ from vllm.distributed.parallel_state import get_pcp_group
+
+ self.pcp_world_size = get_pcp_group().world_size
+ self.pcp_rank = get_pcp_group().rank_in_group
+ except AssertionError:
+ self.pcp_world_size = 1
+ self.pcp_rank = 0
+ self.total_cp_world_size = self.pcp_world_size * self.dcp_world_size
+ self.total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank
+
self.need_to_return_lse_for_decode = (
self.dcp_world_size > 1 and self.can_return_lse_for_decode
)
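A tiny worked example of the flattened CP rank computed above, with assumed group sizes (DCP ranks are contiguous within each PCP rank):

    # Worked example of the combined CP rank (hypothetical sizes).
    pcp_world_size, dcp_world_size = 2, 4
    total_cp_world_size = pcp_world_size * dcp_world_size  # 8 CP ranks in total

    for pcp_rank in range(pcp_world_size):
        for dcp_rank in range(dcp_world_size):
            total_cp_rank = pcp_rank * dcp_world_size + dcp_rank
            assert 0 <= total_cp_rank < total_cp_world_size
    # e.g. pcp_rank=1, dcp_rank=2 -> total_cp_rank = 1 * 4 + 2 = 6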
diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py
index 2cbb5c91cc3b3..67c5f7dbba9c0 100644
--- a/vllm/attention/ops/common.py
+++ b/vllm/attention/ops/common.py
@@ -169,12 +169,11 @@ def correct_attn_out(
return out, lse
-def cp_lse_ag_out_rs(
+def _cp_lse_common(
cp_attn_out: torch.Tensor,
cp_attn_lse: torch.Tensor,
cp_group: GroupCoordinator,
- ctx: CPTritonContext = None,
- return_lse=False,
+ ctx: CPTritonContext | None = None,
):
"""
cp_attn_out: [ B, H, D ]
@@ -195,6 +194,22 @@ def cp_lse_ag_out_rs(
cp_attn_lse = cp_attn_lse.contiguous()
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
+ assert out.is_contiguous()
+ return out, lse
+
+
+def cp_lse_ag_out_rs(
+ cp_attn_out: torch.Tensor,
+ cp_attn_lse: torch.Tensor,
+ cp_group: GroupCoordinator,
+ ctx: CPTritonContext | None = None,
+ return_lse: bool = False,
+):
+ """
+ cp_attn_out: [ B, H, D ]
+ cp_attn_lse: [ B, H ]
+ """
+ out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
out = cp_group.reduce_scatter(out, dim=1)
if return_lse:
@@ -205,6 +220,25 @@ def cp_lse_ag_out_rs(
return out
+def cp_lse_ag_out_ar(
+ cp_attn_out: torch.Tensor,
+ cp_attn_lse: torch.Tensor,
+ cp_group: GroupCoordinator,
+ ctx: CPTritonContext | None = None,
+ return_lse: bool = False,
+):
+ """
+ cp_attn_out: [ B, H, D ]
+ cp_attn_lse: [ B, H ]
+ """
+ out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
+ out = cp_group.all_reduce(out)
+
+ if return_lse:
+ return out, lse
+ return out
+
+
@triton.jit
def _pack_seq_kernel(
x_ptr, # [N, D]
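Both helpers share the all-gather-and-correct step and differ only in the final collective. A single-process sketch of the math that the correction plus reduction implements (hypothetical shapes; the real code uses a Triton kernel and a process group):

    import torch

    def combine_partial_attn(outs: torch.Tensor, lses: torch.Tensor) -> torch.Tensor:
        # outs: [num_ranks, B, H, D] partial attention outputs per CP rank
        # lses: [num_ranks, B, H]    log-sum-exp per CP rank
        global_lse = torch.logsumexp(lses, dim=0)              # [B, H]
        weights = torch.exp(lses - global_lse).unsqueeze(-1)   # [num_ranks, B, H, 1]
        return (weights * outs).sum(dim=0)                     # [B, H, D]

    outs = torch.randn(2, 4, 8, 16)
    lses = torch.randn(2, 4, 8)
    combined = combine_partial_attn(outs, lses)
    assert combined.shape == (4, 8, 16)

In the distributed versions each rank contributes one slice of `outs`; `cp_lse_ag_out_rs` finishes with a reduce-scatter over the head dimension so each rank keeps a shard, while the new `cp_lse_ag_out_ar` finishes with an all-reduce so every rank keeps the full tensor.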
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 0f107a7a3ef83..4b0236d8de3f5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -71,6 +71,8 @@ class ParallelConfig:
"""Number of pipeline parallel groups."""
tensor_parallel_size: int = 1
"""Number of tensor parallel groups."""
+ prefill_context_parallel_size: int = 1
+ """Number of prefill context parallel groups."""
data_parallel_size: int = 1
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
@@ -239,14 +241,25 @@ class ParallelConfig:
needs to be divisible by dcp_size."""
dcp_kv_cache_interleave_size: int = 1
- """Interleave size of kv_cache storage while using dcp or cp > 1,
- store interleave_size tokens on (d)cp i,
- then store next interleave_size tokens on (d)cp i+1.
- Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
- Interleave_size=block_size: block-level align, first fill the block on first rank,
- token is stored on rank i+1 block j after rank i block j is full.
- Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
- Block_size should be divisible by dcp_kv_cache_interleave_size.
+ """
+ Interleave size of kv_cache storage while using DCP.
+ dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
+ and will be deprecated when PCP is fully supported.
+
+ """
+ cp_kv_cache_interleave_size: int = 1
+ """Interleave size of kv_cache storage while using DCP or PCP.
+    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`
+    and `total_cp_world_size = pcp_world_size * dcp_world_size`:
+    store interleave_size tokens on total_cp_rank i,
+    then store the next interleave_size tokens on total_cp_rank i+1.
+ Interleave_size=1: token-level alignment, where token `i` is stored on
+ total_cp_rank `i % total_cp_world_size`.
+ Interleave_size=block_size: block-level alignment, where tokens are
+ first populated to the preceding ranks. Tokens are then stored
+ in (rank i+1, block j) only after (rank i, block j) is fully occupied.
+ Block_size should be greater than or equal to cp_kv_cache_interleave_size.
+ Block_size should be divisible by cp_kv_cache_interleave_size.
"""
_api_process_count: int = Field(default=1, gt=0)
@@ -311,6 +324,11 @@ class ParallelConfig:
"num_redundant_experts."
)
+ if self.prefill_context_parallel_size > 1:
+ raise ValueError(
+ "Prefill context parallelism is not fully supported. "
+ "Please set prefill_context_parallel_size to 1."
+ )
return self
@property
@@ -529,7 +547,11 @@ class ParallelConfig:
)
# Continue with the rest of the initialization
- self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
+ self.world_size = (
+ self.pipeline_parallel_size
+ * self.tensor_parallel_size
+ * self.prefill_context_parallel_size
+ )
if self.distributed_executor_backend == "external_launcher":
logger.info("Using external launcher for distributed inference.")
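A worked example of the token-to-rank mapping that cp_kv_cache_interleave_size controls, using assumed sizes (the helper below is illustrative, not part of the patch):

    def cp_rank_of_token(token_idx: int, interleave: int, total_cp_world_size: int) -> int:
        return (token_idx // interleave) % total_cp_world_size

    total_cp_world_size = 4
    # interleave = 1: token-level alignment, token i goes to rank i % 4.
    assert [cp_rank_of_token(i, 1, total_cp_world_size) for i in range(8)] == [
        0, 1, 2, 3, 0, 1, 2, 3
    ]
    # interleave = 16 (== block_size): block-level alignment, the first 16 tokens
    # fill rank 0's block before any token lands on rank 1.
    assert cp_rank_of_token(15, 16, total_cp_world_size) == 0
    assert cp_rank_of_token(16, 16, total_cp_world_size) == 1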
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 672b004c4aa56..d64e315b4fe39 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -481,6 +481,14 @@ class VllmConfig:
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            # Prefill context parallel does not support full cudagraphs
+ elif self.parallel_config.prefill_context_parallel_size > 1:
+ logger.warning_once(
+ "Prefill context parallel (PCP) is enabled, which is "
+ "incompatible with full CUDA graphs. "
+ "Overriding cudagraph_mode to PIECEWISE."
+ )
+ self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
elif self.model_config is not None:
if self.model_config.pooler_config is not None:
logger.warning_once(
@@ -610,22 +618,34 @@ class VllmConfig:
# If DCP, ensure the block size is right.
if self.parallel_config.decode_context_parallel_size > 1:
+ if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+ self.parallel_config.cp_kv_cache_interleave_size
+ != self.parallel_config.dcp_kv_cache_interleave_size
+ ):
+ self.parallel_config.cp_kv_cache_interleave_size = (
+ self.parallel_config.dcp_kv_cache_interleave_size
+ )
+ logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by "
+                    "dcp_kv_cache_interleave_size; dcp-kv-cache-interleave-size "
+                    "will be deprecated when PCP is fully supported."
+ )
assert (
- self.parallel_config.dcp_kv_cache_interleave_size
+ self.parallel_config.cp_kv_cache_interleave_size
<= self.cache_config.block_size
and self.cache_config.block_size
- % self.parallel_config.dcp_kv_cache_interleave_size
+ % self.parallel_config.cp_kv_cache_interleave_size
== 0
), (
f"Block_size({self.cache_config.block_size}) should be greater "
- "than or equal to and divisible by dcp_kv_cache_interleave_size "
- f"({self.parallel_config.dcp_kv_cache_interleave_size})."
+ "than or equal to and divisible by cp_kv_cache_interleave_size "
+ f"({self.parallel_config.cp_kv_cache_interleave_size})."
)
assert (
- self.parallel_config.dcp_kv_cache_interleave_size == 1
+ self.parallel_config.cp_kv_cache_interleave_size == 1
or self.speculative_config is None
- ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now."
+ ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
# Do this after all the updates to compilation_config.mode
if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 852c4c644433f..f81612fd1f4a3 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1098,6 +1098,12 @@ get_context_model_parallel_group = get_dcp_group
_PP: GroupCoordinator | None = None
+
+def get_pp_group() -> GroupCoordinator:
+ assert _PP is not None, "pipeline model parallel group is not initialized"
+ return _PP
+
+
_DP: GroupCoordinator | None = None
@@ -1114,9 +1120,12 @@ def get_ep_group() -> GroupCoordinator:
return _EP
-def get_pp_group() -> GroupCoordinator:
- assert _PP is not None, "pipeline model parallel group is not initialized"
- return _PP
+_PCP: GroupCoordinator | None = None
+
+
+def get_pcp_group() -> GroupCoordinator:
+ assert _PCP is not None, "prefill context parallel group is not initialized"
+ return _PCP
@deprecated(
@@ -1276,6 +1285,7 @@ def init_distributed_environment(
def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
+ prefill_context_model_parallel_size: int = 1,
decode_context_model_parallel_size: int | None = 1,
backend: str | None = None,
) -> None:
@@ -1325,7 +1335,11 @@ def initialize_model_parallel(
# to get group_ranks for each dimension, transpose that dimension to the
# last dimension, then reshape to 2D, then unbind the last dimension
all_ranks = torch.arange(world_size).reshape(
- -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size
+ -1,
+ data_parallel_size,
+ pipeline_model_parallel_size,
+ prefill_context_model_parallel_size,
+ tensor_model_parallel_size,
) # noqa
# Build the tensor model-parallel groups.
@@ -1360,11 +1374,23 @@ def initialize_model_parallel(
group_name="dcp",
)
+ global _PCP
+ assert _PCP is None, "prefill context parallel group is already initialized"
+ group_ranks = (
+ all_ranks.transpose(3, 4)
+ .reshape(-1, prefill_context_model_parallel_size)
+ .unbind(0)
+ )
+ group_ranks = [x.tolist() for x in group_ranks]
+ _PCP = init_model_parallel_group(
+ group_ranks, get_world_group().local_rank, backend, group_name="pcp"
+ )
+
# Build the pipeline model-parallel groups.
global _PP
assert _PP is None, "pipeline model parallel group is already initialized"
group_ranks = (
- all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0)
+ all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0)
)
group_ranks = [x.tolist() for x in group_ranks]
_PP = init_model_parallel_group(
@@ -1373,7 +1399,7 @@ def initialize_model_parallel(
global _DP
assert _DP is None, "data parallel group is already initialized"
- group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0)
+ group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0)
group_ranks = [x.tolist() for x in group_ranks]
_DP = init_model_parallel_group(
group_ranks, get_world_group().local_rank, backend, group_name="dp"
@@ -1383,7 +1409,12 @@ def initialize_model_parallel(
assert _EP is None, "expert parallel group is already initialized"
group_ranks = (
all_ranks.transpose(1, 2)
- .reshape(-1, data_parallel_size * tensor_model_parallel_size)
+ .reshape(
+ -1,
+ data_parallel_size
+ * prefill_context_model_parallel_size
+ * tensor_model_parallel_size,
+ )
.unbind(0)
)
group_ranks = [x.tolist() for x in group_ranks]
@@ -1393,11 +1424,13 @@ def initialize_model_parallel(
logger.info_once(
"rank %s in world size %s is assigned as "
- "DP rank %s, PP rank %s, TP rank %s, EP rank %s",
+ "DP rank %s, PP rank %s, PCP rank %s, "
+ "TP rank %s, EP rank %s",
rank,
world_size,
_DP.rank_in_group,
_PP.rank_in_group,
+ _PCP.rank_in_group,
_TP.rank_in_group,
_EP.rank_in_group,
)
@@ -1406,6 +1439,7 @@ def initialize_model_parallel(
def ensure_model_parallel_initialized(
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int,
+ prefill_context_model_parallel_size: int = 1,
decode_context_model_parallel_size: int | None = 1,
backend: str | None = None,
) -> None:
@@ -1418,6 +1452,7 @@ def ensure_model_parallel_initialized(
initialize_model_parallel(
tensor_model_parallel_size,
pipeline_model_parallel_size,
+ prefill_context_model_parallel_size,
decode_context_model_parallel_size,
backend,
)
@@ -1434,6 +1469,12 @@ def ensure_model_parallel_initialized(
f"got: {pp_world_size=} vs. "
f"wanted: {pipeline_model_parallel_size=}"
)
+ pcp_world_size = get_pcp_group().world_size
+ assert pcp_world_size == prefill_context_model_parallel_size, (
+ "prefill context parallel group already initialized, but of unexpected size: "
+ f"{pcp_world_size=} vs. "
+ f"{prefill_context_model_parallel_size=}"
+ )
def prepare_communication_buffer_for_model(model: torch.nn.Module):
@@ -1445,6 +1486,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
"""
if _TP is not None:
_TP.prepare_communication_buffer_for_model(model)
+ if _PCP is not None:
+ _PCP.prepare_communication_buffer_for_model(model)
if _PP is not None:
_PP.prepare_communication_buffer_for_model(model)
if _DP is not None:
@@ -1520,16 +1563,21 @@ def destroy_model_parallel():
_TP.destroy()
_TP = None
- global _PP
- if _PP:
- _PP.destroy()
- _PP = None
-
global _DCP
if _DCP:
_DCP.destroy()
_DCP = None
+ global _PCP
+ if _PCP:
+ _PCP.destroy()
+ _PCP = None
+
+ global _PP
+ if _PP:
+ _PP.destroy()
+ _PP = None
+
global _DP
if _DP:
_DP.destroy()
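A small sketch of how the PCP groups fall out of the new 5-D rank grid, mirroring the reshape/transpose/unbind pattern above with assumed sizes:

    import torch

    dp, pp, pcp, tp = 1, 1, 2, 2
    world_size = dp * pp * pcp * tp
    all_ranks = torch.arange(world_size).reshape(-1, dp, pp, pcp, tp)

    # PCP groups: move the PCP axis last, flatten everything else.
    pcp_groups = [x.tolist() for x in
                  all_ranks.transpose(3, 4).reshape(-1, pcp).unbind(0)]
    # TP groups: TP is already the innermost axis.
    tp_groups = [x.tolist() for x in all_ranks.reshape(-1, tp).unbind(0)]

    assert pcp_groups == [[0, 2], [1, 3]]
    assert tp_groups == [[0, 1], [2, 3]]

With TP innermost, ranks that differ only in their PCP index land in the same PCP group, and the PP/DP/EP groups are built the same way from the other axes.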
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e2f7326448b3a..68205b6079d78 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -389,8 +389,10 @@ class EngineArgs:
nnodes: int = ParallelConfig.nnodes
node_rank: int = ParallelConfig.node_rank
tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
+ prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size
decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
+ cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size
data_parallel_size: int = ParallelConfig.data_parallel_size
data_parallel_rank: int | None = None
data_parallel_start_rank: int | None = None
@@ -770,6 +772,15 @@ class EngineArgs:
"--dcp-kv-cache-interleave-size",
**parallel_kwargs["dcp_kv_cache_interleave_size"],
)
+ parallel_group.add_argument(
+ "--cp-kv-cache-interleave-size",
+ **parallel_kwargs["cp_kv_cache_interleave_size"],
+ )
+ parallel_group.add_argument(
+ "--prefill-context-parallel-size",
+ "-pcp",
+ **parallel_kwargs["prefill_context_parallel_size"],
+ )
parallel_group.add_argument(
"--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
)
@@ -1600,6 +1611,7 @@ class EngineArgs:
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size,
+ prefill_context_parallel_size=self.prefill_context_parallel_size,
data_parallel_size=self.data_parallel_size,
data_parallel_rank=self.data_parallel_rank or 0,
data_parallel_external_lb=data_parallel_external_lb,
@@ -1631,6 +1643,7 @@ class EngineArgs:
worker_extension_cls=self.worker_extension_cls,
decode_context_parallel_size=self.decode_context_parallel_size,
dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size,
_api_process_count=self._api_process_count,
_api_process_rank=self._api_process_rank,
)
@@ -1952,6 +1965,15 @@ class EngineArgs:
default_prefix_caching,
) = self.get_chunked_prefill_prefix_caching_defaults(model_config)
+ if self.prefill_context_parallel_size > 1:
+ default_chunked_prefill = False
+ default_prefix_caching = False
+ logger.warning(
+                "--prefill-context-parallel-size > 1 is not yet compatible "
+                "with chunked prefill and prefix caching. Chunked prefill "
+                "and prefix caching have been disabled by default."
+ )
+
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index a7bd64b1c65e9..21eb4d590a7d1 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -8,7 +8,11 @@ import torch
import vllm.envs as envs
from vllm.config import ParallelConfig
-from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank
+from vllm.distributed import (
+ get_dp_group,
+ get_pcp_group,
+ get_tensor_model_parallel_rank,
+)
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
OCP_MX_DTYPES,
@@ -684,9 +688,11 @@ FUSED_MOE_UNQUANTIZED_CONFIG: FusedMoEQuantConfig = FusedMoEQuantConfig.make()
@dataclass
class FusedMoEParallelConfig:
tp_size: int
+ pcp_size: int
dp_size: int
ep_size: int
tp_rank: int
+ pcp_rank: int
dp_rank: int
ep_rank: int
@@ -713,19 +719,22 @@ class FusedMoEParallelConfig:
return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency"
@staticmethod
- def flatten_tp_across_dp(
- tp_size: int, dp_size: int, dp_rank: int
+ def flatten_tp_across_dp_and_pcp(
+ tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int
) -> tuple[int, int]:
tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank()
- # There are actually dp_size * tp_size devices. Update tp_size
- # and tp_rank so we shard across all devices.
- flatten_tp_size = dp_size * tp_size
- flatten_tp_rank = dp_rank * tp_size + tp_rank
+ # There are actually dp_size * pcp_size * tp_size devices.
+ # Update tp_size and tp_rank so we shard across all devices.
+ flatten_tp_size = dp_size * pcp_size * tp_size
+ flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank
return flatten_tp_size, flatten_tp_rank
@staticmethod
def make(
- tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig
+ tp_size_: int,
+ pcp_size_: int,
+ dp_size_: int,
+ vllm_parallel_config: ParallelConfig,
) -> "FusedMoEParallelConfig":
"""
Determine MoE parallel configuration. Based on the input `tp_size_`,
@@ -734,19 +743,22 @@ class FusedMoEParallelConfig:
Args:
tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
+ pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor.
dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
vllm_parallel_config (ParallelConfig): vLLM's parallel config
object which contains the `enable_expert_parallel` flag.
Examples:
When there is no parallelism requested,
- i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes
+ i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes
unaltered and the ranks set to 0.
- Expert Parallelism is considered only when either `dp_size_` or
+ Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or
`tp_size_` is non trivial.
- When TP = 2, DP = 1 and EP = False, the configuration on different
+ Note that PCP serves the same function as DP here.
+
+ When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different
devices:
- device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
@@ -754,7 +766,7 @@ class FusedMoEParallelConfig:
- device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
- Comment : Tensors are sharded across 2 devices.
- When TP = 1, DP = 2 and EP = False, the configuration on different
+ When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different
devices:
- device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
@@ -762,7 +774,7 @@ class FusedMoEParallelConfig:
- Comment: There are 2 engine instances and the tensors are sharded
across 2 decvices.
- When TP = 2, DP = 2 and EP = False, the configuration on different
+ When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different
devices:
- device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
@@ -772,14 +784,14 @@ class FusedMoEParallelConfig:
- Comment: There are 2 engine instances and the tensors are sharded
across 4 devices.
- When, TP = 2, DP = 1 and EP = True, the configuration on different
+ When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different
devices:
- device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
- device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
- Comment: The experts are split between the 2 devices.
- When, TP = 1, DP = 2 and EP = True, the configuration on different
+ When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different
devices:
- device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
@@ -787,7 +799,7 @@ class FusedMoEParallelConfig:
- Comment: There are 2 engine instances and the experts are split
between the 2 devices.
- When TP = 2, DP = 2 and EP = True, the configuration on different
+ When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different
devices:
- device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
@@ -798,18 +810,25 @@ class FusedMoEParallelConfig:
between the 4 devices.
"""
- use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel
+ use_ep = (
+ dp_size_ * pcp_size_ * tp_size_ > 1
+ and vllm_parallel_config.enable_expert_parallel
+ )
dp_size = dp_size_
dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
- tp_size_, dp_size_, dp_rank
+ pcp_size = pcp_size_
+ pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0
+ tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
+ tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank
)
if not use_ep:
return FusedMoEParallelConfig(
tp_size=tp_size,
tp_rank=tp_rank,
+ pcp_size=pcp_size,
+ pcp_rank=pcp_rank,
dp_size=dp_size,
dp_rank=dp_rank,
ep_size=1,
@@ -826,6 +845,8 @@ class FusedMoEParallelConfig:
return FusedMoEParallelConfig(
tp_size=1,
tp_rank=0,
+ pcp_size=pcp_size,
+ pcp_rank=pcp_rank,
dp_size=dp_size,
dp_rank=dp_rank,
ep_size=ep_size,
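A worked example of flatten_tp_across_dp_and_pcp with assumed sizes (no distributed state; the real method reads the TP rank from the TP group):

    def flatten(tp_size, dp_size, dp_rank, pcp_size, pcp_rank, tp_rank):
        flat_tp_size = dp_size * pcp_size * tp_size
        flat_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank
        return flat_tp_size, flat_tp_rank

    # DP=2, PCP=2, TP=2 -> MoE weights are sharded across 8 devices.
    assert flatten(2, 2, 0, 2, 0, 0) == (8, 0)
    assert flatten(2, 2, 0, 2, 1, 1) == (8, 3)
    assert flatten(2, 2, 1, 2, 1, 1) == (8, 7)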
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 8e9bba3442873..7b15e63e9e350 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -18,6 +18,7 @@ from vllm.config.parallel import ExpertPlacementStrategy
from vllm.distributed import (
get_dp_group,
get_ep_group,
+ get_pcp_group,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce,
)
@@ -343,6 +344,7 @@ class FusedMoE(CustomOp):
tp_size: int | None = None,
ep_size: int | None = None,
dp_size: int | None = None,
+ pcp_size: int | None = None,
prefix: str = "",
custom_routing_function: Callable | None = None,
scoring_func: str = "softmax",
@@ -398,12 +400,14 @@ class FusedMoE(CustomOp):
tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
)
dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size
+ pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size
self.is_sequence_parallel = is_sequence_parallel
self.sp_size = tp_size_ if is_sequence_parallel else 1
self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=tp_size_,
+ pcp_size_=pcp_size_,
dp_size_=dp_size_,
vllm_parallel_config=vllm_config.parallel_config,
)
@@ -679,6 +683,10 @@ class FusedMoE(CustomOp):
def dp_size(self):
return self.moe_parallel_config.dp_size
+ @property
+ def pcp_size(self):
+ return self.moe_parallel_config.pcp_size
+
@property
def ep_size(self):
return self.moe_parallel_config.ep_size
@@ -691,6 +699,10 @@ class FusedMoE(CustomOp):
def dp_rank(self):
return self.moe_parallel_config.dp_rank
+ @property
+ def pcp_rank(self):
+ return self.moe_parallel_config.pcp_rank
+
@property
def ep_rank(self):
return self.moe_parallel_config.ep_rank
@@ -1871,6 +1883,19 @@ class FusedMoE(CustomOp):
assert self.shared_experts is not None
shared_output = self.shared_experts(hidden_states)
+        # NOTE: Similar to DP, PCP also needs dispatch and combine. For
+        # simplicity, AgRsAll2All was added separately for PCP here. Maybe
+        # we should modify the All2AllManager abstraction to better support PCP.
+ if self.pcp_size > 1:
+ hidden_states = get_pcp_group().all_gather(
+ hidden_states,
+ dim=0,
+ )
+ router_logits = get_pcp_group().all_gather(
+ router_logits,
+ dim=0,
+ )
+
# Matrix multiply.
final_hidden_states = self.quant_method.apply(
layer=self,
@@ -1925,6 +1950,13 @@ class FusedMoE(CustomOp):
def combine_output(states: torch.Tensor) -> torch.Tensor:
if do_naive_dispatch_combine:
states = get_ep_group().combine(states, self.is_sequence_parallel)
+
+ if self.pcp_size > 1:
+ states = get_pcp_group().reduce_scatter(
+ states,
+ dim=0,
+ )
+
return states
if self.shared_experts is not None:
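A shape-level sketch of the PCP dispatch/combine added to forward_impl, with the collectives emulated on a single process (hypothetical sizes; the real code uses get_pcp_group().all_gather / reduce_scatter):

    import torch

    pcp_size = 2
    local_tokens, hidden = 8, 16

    local_hidden = torch.randn(local_tokens, hidden)

    # Dispatch: every PCP rank sees the full token set before expert routing.
    gathered = torch.cat([local_hidden] * pcp_size, dim=0)   # stand-in for all_gather
    assert gathered.shape == (pcp_size * local_tokens, hidden)

    # ... routed experts run on the gathered tokens ...
    expert_out = gathered * 2.0

    # Combine: reduce-scatter hands each rank back its own token shard, summed
    # across ranks (emulated here by splitting and summing the duplicates).
    shards = expert_out.chunk(pcp_size, dim=0)
    local_out = sum(shards)
    assert local_out.shape == (local_tokens, hidden)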
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index f310f71af92d9..25048330f7974 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -13,6 +13,7 @@ from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
get_dp_group,
get_ep_group,
+ get_pcp_group,
get_pp_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
@@ -322,10 +323,12 @@ class GptOssModel(nn.Module):
# In MoE, we need to flatten the tensor parallel size across the data
# parallel size when EP is disabled.
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
+ tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
tp_size=get_tensor_model_parallel_world_size(),
dp_size=get_dp_group().world_size,
dp_rank=get_dp_group().rank_in_group,
+ pcp_size=get_pcp_group().world_size,
+ pcp_rank=get_pcp_group().rank_in_group,
)
intermediate_size = self.config.intermediate_size
@@ -507,10 +510,12 @@ class GptOssModel(nn.Module):
# In MoE, we need to flatten the tensor parallel size across the data
# parallel size when EP is disabled.
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp(
+ tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
tp_size=get_tensor_model_parallel_world_size(),
dp_size=get_dp_group().world_size,
dp_rank=get_dp_group().rank_in_group,
+ pcp_size=get_pcp_group().world_size,
+ pcp_rank=get_pcp_group().rank_in_group,
)
intermediate_size = self.config.intermediate_size
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index fdc99a0df1c8a..cf3c1d05f5b3f 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -265,8 +265,8 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
self.dcp_world_size = 1
self.dcp_rank = 0
- self.dcp_kv_cache_interleave_size = (
- self.parallel_config.dcp_kv_cache_interleave_size
+ self.cp_kv_cache_interleave_size = (
+ self.parallel_config.cp_kv_cache_interleave_size
)
self.use_full_cuda_graph = (
@@ -388,7 +388,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
dcp_context_kv_lens_cpu,
self.dcp_world_size,
self.dcp_rank,
- self.dcp_kv_cache_interleave_size,
+ self.cp_kv_cache_interleave_size,
)
dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device)
max_dcp_context_kv_len = dcp_context_kv_lens.max().item()
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index e328049b53c7e..32f406980f2ed 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -536,7 +536,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
- self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size
+ self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size
self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size
# Don't try to access the runner on AMD
@@ -1289,8 +1289,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
get_current_vllm_config()
)
)
- self.dcp_kv_cache_interleave_size: int = (
- get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size
+ self.cp_kv_cache_interleave_size: int = (
+ get_current_vllm_config().parallel_config.cp_kv_cache_interleave_size
)
def _flash_attn_varlen_diff_headdims(
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 0dd1896331291..540a8e2b1d016 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1080,9 +1080,9 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor):
def get_dcp_local_seq_lens(
seq_lens: torch.Tensor,
- dcp_world_size: int = 1,
+ dcp_size: int = 1,
dcp_rank: int | None = None,
- dcp_kv_cache_interleave_size: int = 1,
+ cp_kv_cache_interleave_size: int = 1,
) -> torch.Tensor:
"""While using dcp, kv_cache size stored on each rank may be different,
use this function to calculate split decode seq_lens of each dcp rank.
@@ -1091,7 +1091,7 @@ def get_dcp_local_seq_lens(
num_requests = seq_lens.size(0)
if dcp_rank is None:
rank_offsets = (
- torch.arange(dcp_world_size, dtype=torch.int32)
+ torch.arange(dcp_size, dtype=torch.int32)
.unsqueeze(0)
.repeat(num_requests, 1)
)
@@ -1102,15 +1102,15 @@ def get_dcp_local_seq_lens(
)
base = (
seq_lens_tiled
- // dcp_kv_cache_interleave_size
- // dcp_world_size
- * dcp_kv_cache_interleave_size
+ // cp_kv_cache_interleave_size
+ // dcp_size
+ * cp_kv_cache_interleave_size
)
- remainder = seq_lens_tiled - base * dcp_world_size
+ remainder = seq_lens_tiled - base * dcp_size
remainder = torch.clip(
- remainder - rank_offsets * dcp_kv_cache_interleave_size,
+ remainder - rank_offsets * cp_kv_cache_interleave_size,
0,
- dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size,
)
dcp_local_seq_lens = base + remainder
return dcp_local_seq_lens.squeeze(1)
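A worked example of get_dcp_local_seq_lens with the renamed cp_kv_cache_interleave_size argument, using assumed numbers and pure torch:

    import torch

    seq_lens = torch.tensor([10], dtype=torch.int32)
    dcp_size, interleave = 4, 2

    rank_offsets = torch.arange(dcp_size, dtype=torch.int32).unsqueeze(0)  # [1, 4]
    seq_lens_tiled = seq_lens.unsqueeze(1).repeat(1, dcp_size)             # [1, 4]

    base = seq_lens_tiled // interleave // dcp_size * interleave
    remainder = torch.clip(
        seq_lens_tiled - base * dcp_size - rank_offsets * interleave, 0, interleave
    )
    local = base + remainder
    # Tokens 0-1 -> rank 0, 2-3 -> rank 1, 4-5 -> rank 2, 6-7 -> rank 3, 8-9 -> rank 0.
    assert local.tolist() == [[4, 2, 2, 2]]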
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 137e5e0cdb6d2..1531b61f88fe2 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -27,6 +27,7 @@ class KVCacheCoordinator(ABC):
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
+ pcp_world_size: int,
):
self.kv_cache_config = kv_cache_config
self.max_model_len = max_model_len
@@ -44,6 +45,7 @@ class KVCacheCoordinator(ABC):
block_pool=self.block_pool,
kv_cache_group_id=i,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
for i, kv_cache_group in enumerate(self.kv_cache_config.kv_cache_groups)
)
@@ -210,6 +212,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
use_eagle: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
+ pcp_world_size: int,
):
super().__init__(
kv_cache_config,
@@ -218,6 +221,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
False,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
self.num_single_type_manager = len(self.single_type_managers)
@@ -250,6 +254,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
+ pcp_world_size: int,
):
super().__init__(
kv_cache_config,
@@ -258,12 +263,16 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec
self.block_size = self.kv_cache_spec.block_size
self.dcp_world_size = dcp_world_size
+ self.pcp_world_size = pcp_world_size
if dcp_world_size > 1:
self.block_size *= dcp_world_size
+ if pcp_world_size > 1:
+ self.block_size *= pcp_world_size
assert len(self.kv_cache_config.kv_cache_groups) == 1, (
"UnitaryKVCacheCoordinator assumes only one kv cache group"
)
@@ -281,6 +290,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
kv_cache_spec=self.kv_cache_spec,
use_eagle=self.use_eagle,
dcp_world_size=self.dcp_world_size,
+ pcp_world_size=self.pcp_world_size,
)
return hit_blocks, len(hit_blocks[0]) * self.block_size
@@ -302,6 +312,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
+ pcp_world_size: int,
):
super().__init__(
kv_cache_config,
@@ -310,8 +321,10 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
assert dcp_world_size == 1, "DCP not support hybrid attn now."
+ assert pcp_world_size == 1, "PCP not support hybrid attn now."
self.verify_and_split_kv_cache_groups()
def verify_and_split_kv_cache_groups(self) -> None:
@@ -452,6 +465,7 @@ def get_kv_cache_coordinator(
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
+ pcp_world_size: int,
) -> KVCacheCoordinator:
if not enable_caching:
return KVCacheCoordinatorNoPrefixCache(
@@ -460,6 +474,7 @@ def get_kv_cache_coordinator(
use_eagle,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
if len(kv_cache_config.kv_cache_groups) == 1:
return UnitaryKVCacheCoordinator(
@@ -469,6 +484,7 @@ def get_kv_cache_coordinator(
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
return HybridKVCacheCoordinator(
kv_cache_config,
@@ -477,4 +493,5 @@ def get_kv_cache_coordinator(
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 7f405fc248ac2..2012c3fef88bc 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -100,6 +100,7 @@ class KVCacheManager:
log_stats: bool = False,
enable_kv_cache_events: bool = False,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> None:
self.max_model_len = max_model_len
@@ -124,12 +125,9 @@ class KVCacheManager:
0
].kv_cache_spec.block_size
- if dcp_world_size > 1:
+ if dcp_world_size * pcp_world_size > 1:
assert len(kv_cache_config.kv_cache_groups) == 1
- # Note(hc): need revisit. When both DCP and any future
- # PCP are enabled, the block_size may need to be scaled
- # by a factor of dcp_size × pcp_size?
- self.block_size *= dcp_world_size
+ self.block_size *= dcp_world_size * pcp_world_size
self.coordinator = get_kv_cache_coordinator(
kv_cache_config=kv_cache_config,
@@ -138,6 +136,7 @@ class KVCacheManager:
enable_caching=self.enable_caching,
enable_kv_cache_events=enable_kv_cache_events,
dcp_world_size=dcp_world_size,
+ pcp_world_size=pcp_world_size,
)
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
self.block_pool = self.coordinator.block_pool
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 6e026215d4022..01ecd881115df 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1219,11 +1219,16 @@ def _report_kv_cache_config(
// len(kv_cache_config.kv_cache_groups)
* min_block_size
)
- if vllm_config.parallel_config.decode_context_parallel_size > 1:
- num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
+ dcp_size = vllm_config.parallel_config.decode_context_parallel_size
+ pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
+ if pcp_size * dcp_size > 1:
+ num_tokens *= pcp_size * dcp_size
logger.info(
- "Multiplying the GPU KV cache size by the dcp_world_size %d.",
- vllm_config.parallel_config.decode_context_parallel_size,
+ "Multiplying the GPU KV cache size by the cp_world_size %d "
+ "(pcp_world_size %d * dcp_world_size %d).",
+ pcp_size * dcp_size,
+ pcp_size,
+ dcp_size,
)
num_tokens_str = f"{num_tokens:,}"
logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local")
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 4323141c435b7..4cc4c29591cc0 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -121,6 +121,7 @@ class Scheduler(SchedulerInterface):
self.block_size = block_size
self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size
+ self.pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size
# req_id -> Request
self.requests: dict[str, Request] = {}
@@ -183,6 +184,7 @@ class Scheduler(SchedulerInterface):
log_stats=self.log_stats,
enable_kv_cache_events=self.enable_kv_cache_events,
dcp_world_size=self.dcp_world_size,
+ pcp_world_size=self.pcp_world_size,
)
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 14ac83028ee44..d90ec550f7666 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -32,6 +32,7 @@ class SingleTypeKVCacheManager(ABC):
block_pool: BlockPool,
kv_cache_group_id: int,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> None:
"""
Initializes the SingleTypeKVCacheManager.
@@ -42,8 +43,9 @@ class SingleTypeKVCacheManager(ABC):
"""
self.block_size = kv_cache_spec.block_size
self.dcp_world_size = dcp_world_size
- if self.dcp_world_size > 1:
- self.block_size *= dcp_world_size
+ self.pcp_world_size = pcp_world_size
+ if dcp_world_size * pcp_world_size > 1:
+ self.block_size *= dcp_world_size * pcp_world_size
self.kv_cache_spec = kv_cache_spec
self.block_pool = block_pool
@@ -212,6 +214,7 @@ class SingleTypeKVCacheManager(ABC):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
"""
Get the longest cache hit prefix of the blocks that is not longer than
@@ -303,6 +306,7 @@ class FullAttentionManager(SingleTypeKVCacheManager):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(
kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec)
@@ -314,8 +318,8 @@ class FullAttentionManager(SingleTypeKVCacheManager):
[] for _ in range(len(kv_cache_group_ids))
)
block_size = kv_cache_spec.block_size
- if dcp_world_size > 1:
- block_size *= dcp_world_size
+ if dcp_world_size * pcp_world_size > 1:
+ block_size *= dcp_world_size * pcp_world_size
max_num_blocks = max_length // block_size
for block_hash in itertools.islice(block_hashes, max_num_blocks):
# block_hashes is a chain of block hashes. If a block hash is not
@@ -362,11 +366,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(kv_cache_spec, SlidingWindowSpec), (
"SlidingWindowManager can only be used for sliding window groups"
)
assert dcp_world_size == 1, "DCP not support sliding window attn now."
+ assert pcp_world_size == 1, "PCP does not support sliding window attention yet."
# The number of contiguous blocks needed for prefix cache hit.
# -1 since the input token itself is also included in the window
@@ -476,6 +482,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
"""
For chunked local attention, we need to find the longest cache hit
@@ -516,6 +523,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
"Hybrid KV cache is not supported for " + "eagle + chunked local attention."
)
assert dcp_world_size == 1, "DCP not support chunked local attn now."
+ assert pcp_world_size == 1, "PCP does not support chunked local attention yet."
max_num_blocks = max_length // kv_cache_spec.block_size
if max_length > 0:
local_attention_start_idx = (
@@ -611,11 +619,13 @@ class MambaManager(SingleTypeKVCacheManager):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(kv_cache_spec, MambaSpec), (
"MambaManager can only be used for mamba groups"
)
assert dcp_world_size == 1, "DCP not support mamba now."
+ assert pcp_world_size == 1, "PCP does not support Mamba yet."
computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
[] for _ in range(len(kv_cache_group_ids))
)
@@ -705,6 +715,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
+ pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(kv_cache_spec, CrossAttentionSpec), (
"CrossAttentionManager can only be used for cross-attention groups"
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 3a25827cec385..6be19894d332a 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -128,6 +128,7 @@ class EngineCore:
scheduler_block_size = (
vllm_config.cache_config.block_size
* vllm_config.parallel_config.decode_context_parallel_size
+ * vllm_config.parallel_config.prefill_context_parallel_size
)
self.scheduler: SchedulerInterface = Scheduler(
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index ad2ece50f9815..7e8ebe25c4603 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -35,6 +35,7 @@ from vllm.distributed.parallel_state import (
get_dp_group,
get_ep_group,
get_inner_dp_world_group,
+ get_pcp_group,
get_pp_group,
get_tp_group,
)
@@ -110,12 +111,14 @@ class MultiprocExecutor(Executor):
f"({self.parallel_config.nnodes_within_dp}). "
)
self.local_world_size = self.parallel_config.local_world_size
- tensor_parallel_size = self.parallel_config.tensor_parallel_size
- pp_parallel_size = self.parallel_config.pipeline_parallel_size
- assert self.world_size == tensor_parallel_size * pp_parallel_size, (
+ tp_size = self.parallel_config.tensor_parallel_size
+ pp_size = self.parallel_config.pipeline_parallel_size
+ pcp_size = self.parallel_config.prefill_context_parallel_size
+ assert self.world_size == tp_size * pp_size * pcp_size, (
f"world_size ({self.world_size}) must be equal to the "
- f"tensor_parallel_size ({tensor_parallel_size}) x pipeline"
- f"_parallel_size ({pp_parallel_size}). "
+ f"tensor_parallel_size ({tp_size}) x pipeline"
+ f"_parallel_size ({pp_size}) x prefill_context"
+ f"_parallel_size ({pcp_size}). "
)
# Set multiprocessing envs
@@ -424,7 +427,11 @@ class MultiprocExecutor(Executor):
# 16-23, PP rank 2
# 24-31, PP rank 3
# so world_size - tp_size = 32 - 8 = 24 should be PP rank = -1 (i.e. 3)
- return self.world_size - self.parallel_config.tensor_parallel_size
+ return (
+ self.world_size
+ - self.parallel_config.tensor_parallel_size
+ * self.parallel_config.prefill_context_parallel_size
+ )
@dataclass
@@ -828,6 +835,8 @@ class WorkerProc:
dp_rank = get_dp_group().rank_in_group
pp_size = get_pp_group().world_size
pp_rank = get_pp_group().rank_in_group
+ pcp_size = get_pcp_group().world_size
+ pcp_rank = get_pcp_group().rank_in_group
tp_size = get_tp_group().world_size
tp_rank = get_tp_group().rank_in_group
dcp_size = get_dcp_group().world_size
@@ -837,6 +846,8 @@ class WorkerProc:
process_name += f"_DP{dp_rank}"
if pp_size > 1:
process_name += f"_PP{pp_rank}"
+ if pcp_size > 1:
+ process_name += f"_PCP{pcp_rank}"
if tp_size > 1:
process_name += f"_TP{tp_rank}"
if dcp_size > 1:
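A minimal standalone sketch of the rank layout assumed by the world-size assertion and the driver-rank arithmetic in the hunks above, using hypothetical sizes (tp=4, pcp=2, pp=2); it reproduces only the index math, not the executor itself:

tp, pcp, pp = 4, 2, 2
world_size = tp * pcp * pp              # 16, must equal tp * pp * pcp
ranks_per_pp_stage = tp * pcp           # each PP stage owns 8 consecutive ranks
for pp_rank in range(pp):
    first = pp_rank * ranks_per_pp_stage
    print(f"PP rank {pp_rank}: ranks {first}-{first + ranks_per_pp_stage - 1}")
# The driver of the last PP stage therefore sits at world_size - tp * pcp.
assert world_size - tp * pcp == (pp - 1) * ranks_per_pp_stage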
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 7f33eb7e699c7..751862aa9c767 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -95,10 +95,11 @@ class FullAttentionSpec(AttentionSpec):
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
max_model_len = vllm_config.model_config.max_model_len
dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size
+ pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size
# Note(hc): each dcp rank only need save
# (max_model_len//dcp_world_size) tokens locally.
- if dcp_world_size > 1:
- max_model_len = cdiv(max_model_len, dcp_world_size)
+ if dcp_world_size * pcp_world_size > 1:
+ max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size)
return cdiv(max_model_len, self.block_size) * self.page_size_bytes
@classmethod
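A minimal sketch of the per-rank sizing computed by max_memory_usage_bytes above, with hypothetical numbers; each context-parallel rank only keeps max_model_len / (dcp * pcp) tokens of KV locally, rounded up to whole pages:

def cdiv(a: int, b: int) -> int:
    return -(-a // b)

max_model_len, dcp, pcp = 8192, 2, 2
block_size, page_size_bytes = 16, 2 * 1024 * 1024            # hypothetical spec values

local_len = cdiv(max_model_len, dcp * pcp)                   # 2048 tokens per rank
max_bytes = cdiv(local_len, block_size) * page_size_bytes    # 128 pages -> 256 MiB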
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 9f6c19e464308..76e17f3797a1a 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -4,7 +4,7 @@
import numpy as np
import torch
-from vllm.distributed import get_dcp_group
+from vllm.distributed import get_dcp_group, get_pcp_group
from vllm.logger import init_logger
from vllm.utils.math_utils import cdiv
from vllm.v1.utils import CpuGpuBuffer
@@ -22,7 +22,7 @@ class BlockTable:
pin_memory: bool,
device: torch.device,
kernel_block_size: int,
- dcp_kv_cache_interleave_size: int,
+ cp_kv_cache_interleave_size: int,
):
"""
Args:
@@ -80,6 +80,13 @@ class BlockTable:
else:
self._kernel_block_arange = None
+ try:
+ self.pcp_world_size = get_pcp_group().world_size
+ self.pcp_rank = get_pcp_group().rank_in_group
+ except AssertionError:
+ # PCP might not be initialized in testing
+ self.pcp_world_size = 1
+ self.pcp_rank = 0
try:
self.dcp_world_size = get_dcp_group().world_size
self.dcp_rank = get_dcp_group().rank_in_group
@@ -87,7 +94,7 @@ class BlockTable:
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
- self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size
+ self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size
def append_row(
self,
@@ -131,14 +138,16 @@ class BlockTable:
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
- if self.dcp_world_size > 1:
+ total_cp_world_size = self.pcp_world_size * self.dcp_world_size
+ total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank
+ if total_cp_world_size > 1:
# Note(hc): The DCP implement store kvcache with an interleave
# style, the kvcache for the token whose token_idx is i is
# always stored on the GPU whose dcp_rank equals i % cp_world_size:
# Use a "virtual block" which equals to world_size * block_size
# for block_table_indices calculation.
- virtual_block_size = self.block_size * self.dcp_world_size
+ virtual_block_size = self.block_size * total_cp_world_size
block_table_indices = (
req_indices * self.max_num_blocks_per_req
+ positions // virtual_block_size
@@ -150,16 +159,16 @@ class BlockTable:
virtual_block_offsets = positions % virtual_block_size
mask = (
virtual_block_offsets
- // self.dcp_kv_cache_interleave_size
- % self.dcp_world_size
- == self.dcp_rank
+ // self.cp_kv_cache_interleave_size
+ % total_cp_world_size
+ == total_cp_rank
)
# Calculate local block_offsets
block_offsets = (
virtual_block_offsets
- // (self.dcp_world_size * self.dcp_kv_cache_interleave_size)
- * self.dcp_kv_cache_interleave_size
- + virtual_block_offsets % self.dcp_kv_cache_interleave_size
+ // (total_cp_world_size * self.cp_kv_cache_interleave_size)
+ * self.cp_kv_cache_interleave_size
+ + virtual_block_offsets % self.cp_kv_cache_interleave_size
)
# Calculate slot_mapping
slot_mapping = block_numbers * self.block_size + block_offsets
@@ -253,7 +262,7 @@ class MultiGroupBlockTable:
block_sizes: list[int],
kernel_block_sizes: list[int],
num_speculative_tokens: int = 0,
- dcp_kv_cache_interleave_size: int = 1,
+ cp_kv_cache_interleave_size: int = 1,
) -> None:
# Note(hc): each dcp rank only store
# (max_model_len//dcp_world_size) tokens in kvcache,
@@ -283,7 +292,7 @@ class MultiGroupBlockTable:
pin_memory,
device,
kernel_block_size,
- dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size,
)
for block_size, kernel_block_size in zip(block_sizes, kernel_block_sizes)
]
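A minimal standalone sketch of the interleaved slot-mapping arithmetic in the BlockTable hunk above, with hypothetical sizes (block_size=4, two total context-parallel ranks, cp_kv_cache_interleave_size=2); it shows which rank owns each global token position and at which local offset:

block_size, total_cp_world_size, interleave = 4, 2, 2
virtual_block_size = block_size * total_cp_world_size    # 8

for pos in range(8):                      # global token positions in one request
    v_off = pos % virtual_block_size
    owner = v_off // interleave % total_cp_world_size
    local_off = (
        v_off // (total_cp_world_size * interleave) * interleave
        + v_off % interleave
    )
    print(f"position {pos}: rank {owner}, local block offset {local_off}")
# Rank 0 stores positions 0,1,4,5 at local offsets 0-3; rank 1 stores 2,3,6,7.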
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index c1bfe727d86e5..7b4bc1d2a2241 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -87,7 +87,7 @@ class InputBatch:
is_spec_decode: bool = False,
is_pooling_model: bool = False,
num_speculative_tokens: int = 0,
- dcp_kv_cache_interleave_size: int = 1,
+ cp_kv_cache_interleave_size: int = 1,
):
self.is_pooling_model = is_pooling_model
self.is_spec_decode = is_spec_decode
@@ -141,7 +141,7 @@ class InputBatch:
block_sizes=block_sizes,
kernel_block_sizes=kernel_block_sizes,
num_speculative_tokens=num_speculative_tokens,
- dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
)
# Sampling-related.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0c35f1330e9f0..80f8344d44100 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -426,7 +426,7 @@ class GPUModelRunner(
# uses output token ids so we set this conservatively.
logitsprocs_need_output_token_ids=bool(custom_logitsprocs),
is_pooling_model=self.is_pooling_model,
- dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size,
+ cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size,
)
self.use_async_scheduling = self.scheduler_config.async_scheduling
@@ -1436,7 +1436,7 @@ class GPUModelRunner(
self.seq_lens.cpu[:num_reqs],
self.dcp_world_size,
self.dcp_rank,
- self.parallel_config.dcp_kv_cache_interleave_size,
+ self.parallel_config.cp_kv_cache_interleave_size,
)
self.dcp_local_seq_lens.copy_to_gpu(num_reqs)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 315f01b68499a..b8339fc4dc8b8 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -26,6 +26,7 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group,
)
from vllm.distributed.parallel_state import (
+ get_pcp_group,
get_pp_group,
get_tp_group,
)
@@ -733,6 +734,7 @@ class Worker(WorkerBase):
module.global_num_experts = module.moe_config.num_experts
module.moe_parallel_config = FusedMoEParallelConfig.make(
tp_size_=get_tp_group().world_size,
+ pcp_size_=get_pcp_group().world_size,
dp_size_=get_dp_group().world_size,
vllm_parallel_config=parallel_config,
)
@@ -886,6 +888,7 @@ def init_worker_distributed_environment(
ensure_model_parallel_initialized(
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size,
+ parallel_config.prefill_context_parallel_size,
parallel_config.decode_context_parallel_size,
)
From 68d7231991cc307d6865eac5bfca551c06f67465 Mon Sep 17 00:00:00 2001
From: Ryan Rock
Date: Wed, 19 Nov 2025 15:04:36 -0600
Subject: [PATCH 032/249] [CI/Build] Fix test_prefix_prefill for AMD (#28905)
Signed-off-by: Ryan Rock
---
tests/kernels/attention/test_prefix_prefill.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 78cdbbbf7379d..e041e8c8d2ffa 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -174,11 +174,11 @@ def test_contexted_kv_attention(
block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
- b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+ b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
max_input_len = MAX_SEQ_LEN
# copy kv to cache
- b_seq_start_loc = torch.cumsum(
- torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+ b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+ torch.int32
)
for i in range(BS):
for j in range(query_lens[i]):
@@ -417,11 +417,11 @@ def test_contexted_kv_attention_alibi(
block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
- b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+ b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
max_input_len = MAX_SEQ_LEN
# copy kv to cache
- b_seq_start_loc = torch.cumsum(
- torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+ b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+ torch.int32
)
for i in range(BS):
for j in range(query_lens[i]):
From 1607e664f0de4b7eb113c0259b889edbe73c4341 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 19 Nov 2025 16:18:32 -0500
Subject: [PATCH 033/249] [Bug] Fix Batch Invariant MLA test (#28967)
Signed-off-by: yewentao256
---
tests/v1/determinism/test_batch_invariance.py | 41 +++++++++++++++----
vllm/model_executor/layers/batch_invariant.py | 2 +-
2 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index f018ee551dbfe..d4e88891512c4 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -9,13 +9,33 @@ import torch
from utils import _extract_step_logprobs, _random_prompt, skip_unsupported
from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+BACKENDS: list[str] = [
+ "FLASH_ATTN",
+ "FLASHINFER",
+]
+
+if current_platform.is_cuda() and current_platform.is_device_capability(90):
+ BACKENDS.append("FLASH_ATTN_MLA")
+
+DEFAULT_MODEL = "Qwen/Qwen3-1.7B"
+MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+
+
+def resolve_model_name(backend: str) -> str:
+ """Resolve the model name for the given backend, respecting env overrides."""
+ model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL)
+ if backend.endswith("MLA") and model == DEFAULT_MODEL:
+ return MLA_MODEL
+ return model
@skip_unsupported
@pytest.mark.timeout(1000)
@pytest.mark.parametrize(
"backend",
- ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+ BACKENDS,
)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
backend, monkeypatch: pytest.MonkeyPatch
@@ -47,7 +67,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
# Allow overrides from environment (useful for CI tuning)
# "facebook/opt-125m" is too small, doesn't reliably test determinism
- model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+ model = resolve_model_name(backend)
num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128"))
min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024"))
@@ -150,7 +170,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
@skip_unsupported
@pytest.mark.parametrize(
"backend",
- ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+ BACKENDS,
)
@pytest.mark.forked
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
@@ -160,7 +180,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
- model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+ model_name = resolve_model_name(backend)
tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
# For batch invariance, disable custom all-reduce to ensure deterministic
@@ -369,7 +389,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
@skip_unsupported
@pytest.mark.parametrize(
"backend",
- ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+ BACKENDS,
)
def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
"""
@@ -377,7 +397,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
Useful for quick smoke testing and debugging.
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
- model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+ model = resolve_model_name(backend)
llm = LLM(
model=model,
@@ -419,7 +439,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
@skip_unsupported
@pytest.mark.parametrize(
"backend",
- ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"],
+ BACKENDS,
)
@pytest.mark.forked
def test_logprobs_without_batch_invariance_should_fail(
@@ -434,6 +454,9 @@ def test_logprobs_without_batch_invariance_should_fail(
The test will PASS if we detect differences (proving batch invariance matters).
The test will FAIL if everything matches (suggesting batch invariance isn't needed).
"""
+ from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+
+ vllm_is_batch_invariant.cache_clear()
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
# CRITICAL: Disable batch invariance for this test
@@ -441,7 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
- model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+ model_name = resolve_model_name(backend)
tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
print(f"\n{'=' * 80}")
@@ -659,7 +682,7 @@ def test_decode_logprobs_match_prefill_logprobs(
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
- model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+ model_name = resolve_model_name(backend)
tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
from vllm.model_executor.layers.batch_invariant import (
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 7920d117de5e0..5dbeb29174349 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -803,11 +803,11 @@ def override_envs_for_invariance():
"FLASH_ATTN", # best supported backend
"FLASHINFER",
"FLASH_ATTN_MLA",
- "FLASHINFER_MLA",
"TRITON_MLA",
# Not yet supported MLA backends
# "FLASHMLA",
# "FLEX_ATTENTION", # IMA issue even if we disable batch invariance
+ # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967
]
if curr_attn_backend not in supported_backends:
warning = (
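A minimal sketch of why the cache_clear() call added to the test above is needed, assuming vllm_is_batch_invariant is a cached environment check (the helper and env var below are hypothetical stand-ins):

import functools
import os

@functools.lru_cache
def is_batch_invariant() -> bool:            # stand-in for the cached helper
    return os.getenv("SOME_BATCH_INVARIANT_FLAG", "0") == "1"

os.environ["SOME_BATCH_INVARIANT_FLAG"] = "1"
assert is_batch_invariant()                  # True, and now cached
os.environ["SOME_BATCH_INVARIANT_FLAG"] = "0"
assert is_batch_invariant()                  # still True: the cache hides the change
is_batch_invariant.cache_clear()
assert not is_batch_invariant()              # False once the cache is cleared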
From cdeec2e6067613c501f82463d54e420097f49750 Mon Sep 17 00:00:00 2001
From: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Date: Wed, 19 Nov 2025 22:20:58 +0100
Subject: [PATCH 034/249] [BugFix] Ray with multiple nodes (#28873)
Signed-off-by: Julien Denize
---
vllm/v1/worker/gpu_worker.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index b8339fc4dc8b8..7f9cdd221224b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -205,14 +205,14 @@ class Worker(WorkerBase):
assert self.local_rank < torch.cuda.device_count(), (
f"DP adjusted local rank {self.local_rank} is out of bounds. "
)
- visible_device_count = (
- torch.cuda.device_count() if torch.cuda.is_available() else 0
- )
- assert self.parallel_config.local_world_size <= visible_device_count, (
- f"local_world_size ({self.parallel_config.local_world_size}) must be "
- f"less than or equal to the number of visible devices "
- f"({visible_device_count})."
- )
+ visible_device_count = (
+ torch.cuda.device_count() if torch.cuda.is_available() else 0
+ )
+ assert self.parallel_config.local_world_size <= visible_device_count, (
+ f"local_world_size ({self.parallel_config.local_world_size}) must "
+ f"be less than or equal to the number of visible devices "
+ f"({visible_device_count})."
+ )
self.device = torch.device(f"cuda:{self.local_rank}")
current_platform.set_device(self.device)
From 613abb50d5715ba693ee9d5b727e8385b98e7185 Mon Sep 17 00:00:00 2001
From: Shu Wang
Date: Wed, 19 Nov 2025 15:29:06 -0600
Subject: [PATCH 035/249] [MoE] Nvfp4 Masked Gemm: Add flashinfer
grouped_gemm_nt_masked (#25990)
Signed-off-by: Shu Wang.
Signed-off-by: mgoin
Co-authored-by: Michael Goin
---
.buildkite/test-pipeline.yaml | 1 +
tests/kernels/moe/test_cutedsl_moe.py | 582 ++++++++++++++++++
vllm/envs.py | 8 +-
.../fused_moe/deepep_ll_prepare_finalize.py | 16 +-
.../fused_moe/flashinfer_cutedsl_moe.py | 346 +++++++++++
.../layers/quantization/modelopt.py | 30 +-
.../quantization/utils/flashinfer_fp4_moe.py | 43 +-
.../quantization/utils/flashinfer_utils.py | 25 +-
.../quantization/utils/nvfp4_moe_support.py | 6 +-
vllm/utils/flashinfer.py | 42 ++
10 files changed, 1064 insertions(+), 35 deletions(-)
create mode 100644 tests/kernels/moe/test_cutedsl_moe.py
create mode 100644 vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 98daebcc06931..5309581d8e81f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -921,6 +921,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py
new file mode 100644
index 0000000000000..af1a34d17d48b
--- /dev/null
+++ b/tests/kernels/moe/test_cutedsl_moe.py
@@ -0,0 +1,582 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from vllm.platforms import current_platform
+
+if not current_platform.has_device_capability(100):
+ pytest.skip(
+ reason="Nvfp4 Requires compute capability of 10 or above.",
+ allow_module_level=True,
+ )
+
+import torch
+from flashinfer import fp4_quantize
+from torch.nn import functional as F
+
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (
+ flashinfer_cutedsl_moe_masked,
+)
+from vllm.utils.flashinfer import (
+ flashinfer_cutedsl_grouped_gemm_nt_masked as cutedsl_gmm_masked,
+)
+from vllm.utils.flashinfer import (
+ scaled_fp4_grouped_quantize,
+)
+
+kE2M1ToFloat = torch.tensor(
+ [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
+
+FLOAT8_E4M3_MAX = 448.0
+FLOAT4_E2M1_MAX = 6.0
+
+
+def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
+ m_tiles = (m + 128 - 1) // 128
+ f = block_size * 4
+ k_tiles = (k + f - 1) // f
+ tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4))
+ tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+ out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size)
+ return out[0:m, 0:k]
+
+
+def dequantize_nvfp4_to_dtype(
+ tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
+ """Dequantize the fp4 tensor back to high precision."""
+ # Two fp4 values are packed into one uint8.
+ assert tensor_fp4.dtype == torch.uint8
+ m, packed_k = tensor_fp4.shape
+ k = packed_k * 2
+ tensor_f32 = break_fp4_bytes(tensor_fp4, dtype)
+ tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size)
+ tensor_sf = tensor_sf.view(torch.float8_e4m3fn)
+ tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size)
+ tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale
+
+ # scale the tensor
+ out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
+ return out.to(dtype=dtype)
+
+
+def break_fp4_bytes(a, dtype):
+ assert a.dtype == torch.uint8
+ m, n = a.shape
+
+ # Vectorized nibble processing
+ a_flat = a.flatten()
+ high = (a_flat & 0xF0) >> 4 # Upper nibbles
+ low = a_flat & 0x0F # Lower nibbles
+
+ # Combine nibbles for batch processing
+ combined = torch.stack((low, high), dim=1).flatten()
+
+ # Vectorized sign and magnitude extraction
+ signs = (combined & 0x08).to(torch.bool) # Sign bits
+ abs_vals = (combined & 0x07).to(torch.long) # Magnitude indices
+
+ # Device-aware lookup and sign application
+ kE2M1 = kE2M1ToFloat.to(device=a.device)
+ values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0)
+
+ # Reshape to final form
+ return values.reshape(m, n * 2).to(dtype=dtype)
+
+
+def generate_balanced_routing(
+ hidden_states: torch.Tensor, num_experts: int, top_k: int
+):
+ """
+ Generate routing weights and topk indices such that every expert is active.
+ Returns routing_weights, topk_idx
+ """
+
+ num_tokens, hidden_dim = hidden_states.shape
+ # num_tokens = batch_size * seq_len
+
+ # First, assign at least one token per expert
+ tokens_per_expert = torch.arange(num_tokens) % num_experts
+ tokens_per_expert = tokens_per_expert[torch.randperm(num_tokens)] # shuffle
+
+ # Each token has top_k experts — start with one guaranteed expert
+ topk_idx = torch.full((num_tokens, top_k), -1, dtype=torch.long)
+ topk_idx[:, 0] = tokens_per_expert
+
+ # For remaining top_k - 1 experts, pick randomly (allowing repeats)
+ if top_k > 1:
+ random_choices = torch.randint(0, num_experts, (num_tokens, top_k - 1))
+ topk_idx[:, 1:] = random_choices
+
+ # Normalize routing weights so each token's weights sum to 1
+ routing_weights = torch.rand(num_tokens, top_k)
+ routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+ # Reshape back if needed
+ routing_weights = routing_weights.view(num_tokens, top_k)
+ topk_idx = topk_idx.view(num_tokens, top_k)
+
+ return routing_weights, topk_idx
+
+
+def prepare_inputs(
+ hidden_states: torch.Tensor,
+ router_logits: torch.Tensor,
+ num_experts: int,
+ topk: int,
+):
+ routing_weights, topk_idx = generate_balanced_routing(
+ router_logits, num_experts, topk
+ )
+
+ masked_m = []
+ for i in range(num_experts):
+ mask = topk_idx.view(-1) == i
+ masked_m.append(mask.sum())
+
+ masked_m = torch.tensor(masked_m, dtype=torch.int32)
+ # Initialize hidden_states_3d with ones instead of empty to avoid NaN
+ # issues.
+ hidden_states_3d = torch.ones(
+ (num_experts, max(masked_m), hidden_states.shape[1]), dtype=hidden_states.dtype
+ )
+ for i in range(num_experts):
+ hidden_states_3d[i, : masked_m[i], :] = hidden_states[topk_idx.view(-1) == i]
+
+ return hidden_states_3d, masked_m, topk_idx, routing_weights
+
+
+MNK_FACTORS = [
+ (2, 1024, 1024),
+ (2, 1024, 1536),
+ (2, 3072, 1024),
+ (2, 3072, 1536),
+ (64, 1024, 1024),
+ (64, 1024, 1536),
+ (64, 3072, 1024),
+ (64, 2048, 1024),
+ (224, 1024, 1024),
+ (224, 1024, 1536),
+]
+
+
+# Reference implementation of torch_moe
+def torch_moe(a, w1, w2, score, topk, expert_map):
+ B, D = a.shape
+ a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+ out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+ score = torch.softmax(score, dim=-1, dtype=torch.float32)
+ topk_weight, topk_ids = torch.topk(score, topk)
+ topk_weight = topk_weight.view(-1)
+ topk_ids = topk_ids.view(-1)
+ if expert_map is not None:
+ topk_ids = expert_map[topk_ids]
+ for i in range(w1.shape[0]):
+ mask = topk_ids == i
+ if mask.sum():
+ out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(
+ 0, 1
+ )
+ return (
+ out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+ ).sum(dim=1)
+
+
+def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids):
+ B, D = a.shape
+ a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+ out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+
+ topk_weight = topk_weight.view(-1)
+ topk_ids = topk_ids.view(-1)
+
+ for i in range(w1.shape[0]):
+ mask = topk_ids == i
+ if mask.sum():
+ m = w1[i].shape[0]
+ assert m % 2 == 0
+ # Note: w1 and w3 are swapped!
+ w3_expert, w1_expert = w1[i][m // 2 :, :], w1[i][: m // 2, :]
+ inter = F.silu(a[mask] @ w1_expert.t()) * (a[mask] @ w3_expert.t())
+ inter_gs = torch.tensor(1.0).cuda()
+ inter_q, inter_blockscale = fp4_quantize(inter, inter_gs)
+ inter = dequantize_nvfp4_to_dtype(
+ inter_q,
+ inter_blockscale,
+ inter_gs,
+ dtype=inter.dtype,
+ device=inter.device,
+ block_size=16,
+ ).cuda()
+ out[mask] = inter @ w2[i].transpose(0, 1)
+ return (
+ out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+ ).sum(dim=1)
+
+
+def grouped_gemm_ref(
+ hidden_states_expanded: torch.Tensor,
+ hidden_states_3d: torch.Tensor,
+ weights: torch.Tensor,
+ topk_idx: torch.Tensor,
+ masked_m: torch.Tensor,
+ B: int,
+ topk: int,
+ num_experts: int,
+ *,
+ block_size: int = 16,
+) -> torch.Tensor:
+ """
+ Computes the reference grouped GEMM (fp4 quantized per-expert loop),
+ computes flashinfer grouped GEMM (for scale consistency),
+ and returns ONLY the repacked reference output: out_ref.
+
+ Returns:
+ out_ref: Tensor [num_experts, max_m, n_out]
+ """
+ device_hs = hidden_states_expanded.device
+ device_w = weights.device
+ out_dtype = weights.dtype
+ n_out = weights.shape[1]
+
+ # Flattened reference output (B*topk, n_out)
+ out = torch.zeros((B * topk, n_out), dtype=out_dtype, device=device_w)
+
+ # Per-expert reference compute loop
+ for i in range(num_experts):
+ mask = topk_idx.view(-1) == i
+ if mask.any():
+ lhs = hidden_states_expanded[mask]
+ rhs = weights[i]
+
+ a_amax = lhs.abs().max().to(torch.float32).to(device_hs)
+ b_amax = rhs.abs().max().to(torch.float32).to(device_w)
+
+ a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax
+ b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
+
+ lhsq, lhsq_sf = fp4_quantize(lhs, a_gs)
+ rhsq, rhsq_sf = fp4_quantize(rhs, b_gs)
+
+ lhs_in_dtype = dequantize_nvfp4_to_dtype(
+ lhsq,
+ lhsq_sf,
+ a_gs,
+ dtype=lhs.dtype,
+ device=device_hs,
+ block_size=block_size,
+ )
+ rhs_in_dtype = dequantize_nvfp4_to_dtype(
+ rhsq,
+ rhsq_sf,
+ b_gs,
+ dtype=rhs.dtype,
+ device=device_w,
+ block_size=block_size,
+ )
+
+ out[mask] = lhs_in_dtype @ rhs_in_dtype.t()
+
+ # Determine per-expert max_m
+ max_m_val = int(masked_m.max().item())
+
+ # Repack into [num_experts, max_m, n_out]
+ out_ref = torch.zeros(
+ (num_experts, max_m_val, n_out),
+ dtype=out.dtype,
+ device=out.device,
+ )
+ expert_slot = [0] * num_experts
+
+ for i, expert_id in enumerate(topk_idx.view(-1).tolist()):
+ slot = expert_slot[expert_id]
+ if slot < max_m_val:
+ out_ref[expert_id, slot, :] = out[i]
+ expert_slot[expert_id] += 1
+ else:
+ raise IndexError(
+ f"Expert {expert_id} exceeded max slots ({max_m_val}). "
+ "Increase max_m or check masked_m."
+ )
+
+ return out_ref
+
+
+def flashinfer_cutedsl_grouped_gemm_nt_masked(
+ hidden_states: torch.Tensor, # 3d
+ input_global_scale: torch.Tensor, # (l,)
+ weights: torch.Tensor,
+ w_global_scale: torch.Tensor, # (l,)
+ masked_m: torch.Tensor,
+):
+ # hidden_states: [l, m, k]
+ # weights: [l, n, k]
+ aq, aq_sf = scaled_fp4_grouped_quantize(
+ hidden_states,
+ masked_m.to(hidden_states.device),
+ input_global_scale,
+ )
+ num_experts, n, k = weights.shape
+ bq, bq_sf = scaled_fp4_grouped_quantize(
+ weights,
+ torch.full((num_experts,), n, device=weights.device, dtype=torch.int32),
+ w_global_scale,
+ )
+
+ out = torch.zeros(
+ (num_experts, max(masked_m), n), dtype=weights.dtype, device=aq.device
+ )
+ out = out.permute(1, 2, 0) # requirement of kernel
+ sf_vec_size = 16
+ ab_dtype = "float4_e2m1fn"
+ sf_dtype = "float8_e4m3fn"
+ c_dtype = "bfloat16"
+ alpha = 1.0 / (input_global_scale * w_global_scale).to(out.dtype).view(
+ 1, 1, num_experts
+ )
+
+ def get_cute_dtype(input: torch.Tensor) -> str:
+ if input.dtype == torch.bfloat16:
+ return "bfloat16"
+ elif input.dtype == torch.float16:
+ return "float16"
+ elif input.dtype == torch.float32:
+ return "float32"
+ else:
+ raise ValueError(f"Unsupported cute dtype {input.dtype}")
+
+ cutedsl_gmm_masked(
+ (aq, aq_sf),
+ (bq, bq_sf),
+ out,
+ masked_m.to(aq.device),
+ ab_dtype=ab_dtype,
+ sf_dtype=sf_dtype,
+ c_dtype=c_dtype,
+ sf_vec_size=sf_vec_size,
+ alpha=alpha,
+ alpha_dtype=get_cute_dtype(alpha),
+ )
+
+ return out
+
+
+@pytest.mark.parametrize("bs, hidden_dim, inter_dim", [(2, 128, 256), (16, 128, 512)])
+@pytest.mark.parametrize("topk", [1, 2, 4])
+@torch.inference_mode()
+def test_flashinfer_cutedsl_moe_masked(
+ bs: int, hidden_dim: int, inter_dim: int, topk: int
+):
+ torch.manual_seed(42)
+ device = "cuda"
+ num_experts = 8
+ hidden_states = (
+ torch.randn(bs, hidden_dim, dtype=torch.bfloat16, device=device) / 5.0
+ )
+ w1 = (
+ torch.randn(
+ num_experts, 2 * inter_dim, hidden_dim, dtype=torch.bfloat16, device=device
+ )
+ / 10.0
+ )
+ w2 = (
+ torch.randn(
+ num_experts, hidden_dim, inter_dim, dtype=torch.bfloat16, device=device
+ )
+ / 10.0
+ )
+ router_logits = torch.randn(bs, num_experts, dtype=torch.float32)
+
+ hidden_states_expanded = (
+ hidden_states.view(bs, -1, hidden_dim)
+ .repeat(1, topk, 1)
+ .reshape(-1, hidden_dim)
+ )
+ hidden_states_3d, masked_m, topk_idx, routing_weights = prepare_inputs(
+ hidden_states_expanded, router_logits, num_experts, topk
+ )
+
+ w1_amax = w1.abs().amax(dim=(1, 2)).to(torch.float32).to(w1.device)
+ w2_amax = w2.abs().amax(dim=(1, 2)).to(torch.float32).to(w2.device)
+ input_global_scale = torch.ones(
+ (num_experts,), dtype=torch.float32, device=hidden_states.device
+ )
+
+ w1_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax
+ w2_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax
+ a2_global_scale = torch.ones(
+ (num_experts,), dtype=torch.float32, device=hidden_states.device
+ ) # assume intermediate scale is 1.0
+
+ w1_fp4, w1_blockscale = scaled_fp4_grouped_quantize(
+ w1,
+ torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim,
+ w1_global_scale,
+ )
+ w2_fp4, w2_blockscale = scaled_fp4_grouped_quantize(
+ w2,
+ torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim,
+ w2_global_scale,
+ )
+
+ w1_alpha = 1.0 / (input_global_scale * w1_global_scale)
+ w2_alpha = 1.0 / (a2_global_scale * w2_global_scale)
+
+ out = torch.empty_like(hidden_states_3d)
+ # Note: the first dim must be num_experts, not bs
+ wk = torch.empty(
+ num_experts,
+ hidden_states_3d.shape[1],
+ inter_dim * 2,
+ dtype=hidden_states_3d.dtype,
+ device=hidden_states.device,
+ )
+ flashinfer_cutedsl_moe_masked(
+ hidden_states_3d.to(hidden_states.device),
+ input_global_scale,
+ w1_fp4.permute(2, 0, 1),
+ w1_blockscale,
+ w1_alpha,
+ w2_fp4.permute(2, 0, 1),
+ a2_global_scale,
+ w2_blockscale,
+ w2_alpha,
+ masked_m.to(hidden_states.device),
+ wk,
+ out,
+ )
+
+ # reference
+ a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale)
+ a_in_dtype = dequantize_nvfp4_to_dtype(
+ a_fp4,
+ a_scale_interleaved,
+ input_global_scale,
+ dtype=hidden_states.dtype,
+ device=hidden_states.device,
+ block_size=16,
+ )
+ w1_d = torch.empty(
+ (num_experts, 2 * inter_dim, hidden_dim), device=w1.device, dtype=w1.dtype
+ )
+ w2_d = torch.empty(
+ (num_experts, hidden_dim, inter_dim), device=w2.device, dtype=w2.dtype
+ )
+
+ for idx in range(0, num_experts):
+ w1_fp4_sliced, w1_blockscale_sliced = fp4_quantize(
+ w1[idx], w1_global_scale[idx]
+ )
+ w2_fp4_sliced, w2_blockscale_sliced = fp4_quantize(
+ w2[idx], w2_global_scale[idx]
+ )
+ w1_d[idx] = dequantize_nvfp4_to_dtype(
+ w1_fp4_sliced,
+ w1_blockscale_sliced,
+ w1_global_scale[idx],
+ dtype=w1.dtype,
+ device=w1.device,
+ block_size=16,
+ )
+ w2_d[idx] = dequantize_nvfp4_to_dtype(
+ w2_fp4_sliced,
+ w2_blockscale_sliced,
+ w2_global_scale[idx],
+ dtype=w2.dtype,
+ device=w2.device,
+ block_size=16,
+ )
+
+ ref_output = torch_moe_nvfp4(
+ a_in_dtype,
+ w1_d,
+ w2_d,
+ topk,
+ routing_weights.to(a_in_dtype.device),
+ topk_idx.to(a_in_dtype.device),
+ )
+ out_weighted = torch.zeros_like(ref_output, device=out.device, dtype=out.dtype)
+
+ positions = torch.nonzero(masked_m[topk_idx], as_tuple=False)
+ rows, cols = positions[:, 0], positions[:, 1]
+ experts = topk_idx[rows, cols]
+ for i in range(num_experts):
+ mask = experts == i
+ if mask.any():
+ idx = torch.nonzero(mask, as_tuple=False).squeeze(-1)
+ r, c = rows[idx], cols[idx]
+ out_weighted[r] += out[i, : len(r), :] * routing_weights[r, c].to(
+ out.device
+ ).unsqueeze(-1)
+ torch.testing.assert_close(
+ out_weighted.cpu(), ref_output.cpu(), atol=2e-1, rtol=2e-1
+ )
+
+
+@pytest.mark.parametrize(
+ "bs, hidden_dim, inter_dim, topk", [(2, 128, 256, 2), (16, 128, 512, 5)]
+)
+@torch.inference_mode()
+def test_grouped_gemm_nt_masked(
+ bs: int, hidden_dim: int, inter_dim: int, topk: int
+) -> None:
+ torch.manual_seed(42)
+ B = bs
+ D = hidden_dim
+ N = inter_dim
+ # CuteDSL grouped GEMM has an issue when not all experts are active,
+ # e.g. masked_m = [2, 3, 0, 0, 1], where the experts at indices 2 and 3 are inactive;
+ # see https://github.com/flashinfer-ai/flashinfer/issues/1856
+ num_experts = bs
+ hidden_states = torch.randn(B, D, dtype=torch.bfloat16, device="cuda")
+ weights = torch.randn(num_experts, N, D, dtype=torch.bfloat16, device="cuda")
+ router_logits = torch.randn(B, num_experts, dtype=torch.float32)
+
+ hidden_states_expanded = (
+ hidden_states.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+ )
+ hidden_states_3d, masked_m, topk_idx, _ = prepare_inputs(
+ hidden_states_expanded, router_logits, num_experts, topk
+ )
+
+ a_amax = (
+ hidden_states_3d.abs()
+ .amax(dim=(1, 2))
+ .to(torch.float32)
+ .to(hidden_states.device)
+ )
+ b_amax = weights.abs().amax(dim=(1, 2)).to(torch.float32).to(weights.device)
+ a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax
+ b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
+ out_flashinfer = flashinfer_cutedsl_grouped_gemm_nt_masked(
+ hidden_states_3d.to(hidden_states.device), a_gs, weights, b_gs, masked_m
+ )
+ # reference
+ out_ref = grouped_gemm_ref(
+ hidden_states_expanded=hidden_states_expanded,
+ hidden_states_3d=hidden_states_3d,
+ weights=weights,
+ topk_idx=topk_idx,
+ masked_m=masked_m,
+ B=B,
+ topk=topk,
+ num_experts=num_experts,
+ )
+ # Note: only compare the masked positions, since CuteDSL may write NaN
+ # into the unmasked positions.
+ for i in range(num_experts):
+ torch.testing.assert_close(
+ out_flashinfer.permute(2, 0, 1)[i, : masked_m[i]],
+ out_ref.to(out_flashinfer.device)[i, : masked_m[i]],
+ atol=1e-1,
+ rtol=1e-1,
+ )
+
+
+if __name__ == "__main__":
+ test_flashinfer_cutedsl_moe_masked(16, 128, 512, 4)
+ test_grouped_gemm_nt_masked(16, 128, 512, 4)
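A minimal standalone sketch of the E2M1 nibble decoding that break_fp4_bytes above performs, applied to a single hypothetical packed byte (the low nibble is the first value, the high nibble the second):

kE2M1 = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]

def decode_fp4_byte(b: int) -> list[float]:
    vals = []
    for nib in (b & 0x0F, (b & 0xF0) >> 4):    # low nibble first, then high
        sign = -1.0 if nib & 0x08 else 1.0     # bit 3 is the sign
        vals.append(sign * kE2M1[nib & 0x07])  # bits 0-2 index the magnitude table
    return vals

assert decode_fp4_byte(0xB2) == [1.0, -1.5]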
diff --git a/vllm/envs.py b/vllm/envs.py
index 212d68114e46e..1ff620af57229 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -157,7 +157,9 @@ if TYPE_CHECKING:
VLLM_USE_FLASHINFER_MOE_FP16: bool = False
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
VLLM_USE_FLASHINFER_MOE_FP4: bool = False
- VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency"
+ VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
+ "latency"
+ )
VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
VLLM_XGRAMMAR_CACHE_MB: int = 0
VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
@@ -1238,7 +1240,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
# - "latency":
# Uses TensorRT-LLM kernels optimized for low-latency inference.
"VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
- "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"]
+ "VLLM_FLASHINFER_MOE_BACKEND",
+ "latency",
+ ["throughput", "latency", "masked_gemm"],
),
# Control the workspace buffer size for the FlashInfer backend.
"VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int(
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index e0db248958b47..fea9f49c04b89 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -6,6 +6,7 @@ import deep_ep
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import envs
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
@@ -27,6 +28,8 @@ logger = init_logger(__name__)
DEEPEP_QUANT_BLOCK_SIZE = 128
DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]
+logger = init_logger(__name__)
+
def dequant_fp8(
expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor
@@ -187,16 +190,25 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
# TODO (varun): Optimization - Use a batched version of quant
x = x.view((-1, hidden_dim))
+ q_dtype = quant_config.quant_dtype
+
+ if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm":
+ logger.info_once(
+ "Skip quantization when using FlashInfer CUTEDSL(masked_gemm) "
+ "for ModelOptNvFp4FusedMoE."
+ )
+ q_dtype = None
+
x, x_scales = moe_kernel_quantize_input(
x,
quant_config.a1_scale,
- quant_config.quant_dtype,
+ q_dtype,
quant_config.per_act_token_quant,
quant_config.block_shape,
)
x = x.view((num_experts, -1, hidden_dim))
- if quant_config.quant_dtype is not None:
+ if q_dtype is not None:
assert x_scales is not None
x_scales = normalize_batched_scales_shape(x_scales, num_experts)
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
new file mode 100644
index 0000000000000..2747ef04a3499
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
@@ -0,0 +1,346 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+ TopKWeightAndReduceDelegate,
+)
+from vllm.utils.flashinfer import (
+ flashinfer_cutedsl_grouped_gemm_nt_masked,
+ has_flashinfer_cutedsl_grouped_gemm_nt_masked,
+ scaled_fp4_grouped_quantize,
+ silu_and_mul_scaled_nvfp4_experts_quantize,
+)
+
+logger = init_logger(__name__)
+
+
+def is_valid_flashinfer_cutedsl_fused_moe(
+ hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor
+) -> bool:
+ """
+ Check if the given problem size is supported by the FlashInfer CuteDSL MoE
+ kernel.
+ """
+ if not has_flashinfer_cutedsl_grouped_gemm_nt_masked():
+ logger.debug_once(
+ "FlashInferCuteDSLExperts disabled: "
+ "flashinfer_cutedsl_fused_moe not available."
+ )
+ return False
+ # Data type checks
+ if (
+ w1.dtype != torch.uint8
+ or w2.dtype != torch.uint8
+ or hidden_states.dtype not in [torch.float32, torch.float16, torch.bfloat16]
+ ):
+ logger.debug_once(
+ "FlashInferCuteDSLExperts disabled: w1/w2 must be torch.uint8 "
+ f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be "
+ f"float32, float16, or bfloat16 (got {hidden_states.dtype})."
+ )
+ return False
+ return True
+
+
+class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute):
+ def __init__(
+ self,
+ out_dtype: torch.dtype,
+ quant_config: FusedMoEQuantConfig,
+ ):
+ super().__init__(quant_config)
+ assert quant_config.quant_dtype == "nvfp4", (
+ "Only nvfp4 quantization are currently supported."
+ )
+ self.out_dtype = out_dtype
+
+ @property
+ def activation_formats(
+ self,
+ ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+ return (
+ mk.FusedMoEActivationFormat.BatchedExperts,
+ mk.FusedMoEActivationFormat.BatchedExperts,
+ )
+
+ def supports_expert_map(self) -> bool:
+ return False
+
+ def supports_chunking(self) -> bool:
+ # This refers to TP chunking; DP chunking is handled separately.
+ # TODO(shuw@nvidia.com): Set to False to be consistent with
+ # batched_deep_gemm_moe
+ return False
+
+ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+ # Let PrepareAndFinalize::finalize() decide the impl.
+ return TopKWeightAndReduceDelegate()
+
+ def workspace_shapes(
+ self,
+ M: int,
+ N: int,
+ K: int,
+ topk: int,
+ global_num_experts: int,
+ local_num_experts: int,
+ expert_tokens_meta: mk.ExpertTokensMetadata | None,
+ ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+ # We use global_num_experts due to how moe_align_block_size handles
+ # expert_maps.
+ """
+ Compute the shapes for the temporary and final outputs of the two gemms
+ and activation in the fused expert function. Since the gemms are
+ independent, the workspace for the first gemm can be shared with the
+ workspace for the last gemm.
+
+ Returns a tuple of:
+ - workspace13 shape tuple: must be large enough to hold the
+ result of either expert gemm.
+ - workspace2 shape tuple: must be large enough to hold the
+ result of the activation function.
+ - output shape tuple: must be the exact size of the final gemm output.
+ - Note: in order for activation chunking to work, the first dimension
+ of each tuple must be the number of tokens.
+ """
+ output_shape = (local_num_experts, M, K)
+ workspace2 = (local_num_experts, M, N)
+ workspace1 = output_shape
+ return (workspace1, workspace2, output_shape)
+
+ def apply(
+ self,
+ output: torch.Tensor,
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ activation: str,
+ global_num_experts: int,
+ expert_map: torch.Tensor | None,
+ a1q_scale: torch.Tensor | None,
+ a2_scale: torch.Tensor | None, # Not used
+ workspace13: torch.Tensor | None,
+ workspace2: torch.Tensor | None,
+ expert_tokens_meta: mk.ExpertTokensMetadata | None,
+ apply_router_weight_on_input: bool | None,
+ ):
+ assert self.quant_dtype == "nvfp4", (
+ "Only nvfp4 quantization are currently supported."
+ )
+ # Ensure w1_scale and w2_scale are not None before calling view
+ assert self.w1_scale is not None and self.w2_scale is not None, (
+ "w1_scale and w2_scale must not be None for FlashInferExperts"
+ )
+ assert expert_tokens_meta is not None
+ expert_num_tokens = expert_tokens_meta.expert_num_tokens
+ assert hidden_states.ndim == 3
+ assert self.w1_scale.ndim == 3
+ assert self.w2_scale.ndim == 3
+ flashinfer_cutedsl_moe_masked(
+ hidden_states=hidden_states,
+ input_global_scale=self.a1_gscale,
+ w1=w1,
+ w1_blockscale=self.w1_scale,
+ w1_alpha=self.g1_alphas,
+ w2=w2,
+ a2_global_scale=self.a2_gscale,
+ w2_blockscale=self.w2_scale,
+ w2_alpha=self.g2_alphas,
+ masked_m=expert_num_tokens,
+ workspace=workspace2,
+ out=output,
+ )
+
+
+def get_cute_dtype(input: torch.Tensor) -> str:
+ if input.dtype == torch.bfloat16:
+ return "bfloat16"
+ elif input.dtype == torch.float16:
+ return "float16"
+ elif input.dtype == torch.float32:
+ return "float32"
+ else:
+ raise ValueError(f"Unsupported cute dtype {input.dtype}")
+
+
+def flashinfer_cutedsl_moe_masked(
+ hidden_states: torch.Tensor,
+ input_global_scale: torch.Tensor,
+ w1: torch.Tensor,
+ w1_blockscale: torch.Tensor,
+ w1_alpha,
+ w2: torch.Tensor,
+ a2_global_scale: torch.Tensor,
+ w2_blockscale: torch.Tensor,
+ w2_alpha,
+ masked_m: torch.Tensor,
+ workspace: torch.Tensor,
+ out: torch.Tensor,
+):
+ """
+ Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL
+ kernels.
+
+ Args:
+ hidden_states (torch.Tensor): [num_experts, m, k], bf16
+ input_global_scale (torch.Tensor): (l,)
+ w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8
+ w1_blockscale (torch.Tensor): blockscale factors, e4m3,
+ w1_alpha (torch.Tensor): (l,)
+ w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8
+ a2_global_scale (torch.Tensor): (l,)
+ w2_blockscale (torch.Tensor): blockscale factors, e4m3,
+ w2_alpha (torch.Tensor): (l,)
+ masked_m (torch.Tensor): (l,) number of valid rows per expert
+ workspace (torch.Tensor): [l, m, 2 * n] scratch buffer for the gate/up
+ GEMM output
+ out (torch.Tensor): [l, m, k] output buffer, written in place
+
+ Notes:
+ - Assumes max(masked_m) <= m.
+ """
+
+ # === Assertions on dtypes ===
+ assert input_global_scale.dtype == torch.float32, (
+ f"input_global_scale must be float32, got {input_global_scale.dtype}"
+ )
+ assert w1.dtype == torch.uint8, f"w1 must be uint8, got {w1.dtype}"
+ assert w1_blockscale.dtype == torch.float8_e4m3fn, (
+ f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}"
+ )
+ assert w1_alpha.dtype == torch.float32, (
+ f"w1_alpha must be float32, got {w1_alpha.dtype}"
+ )
+ assert w2.dtype == torch.uint8, f"w2 must be uint8, got {w2.dtype}"
+ assert a2_global_scale.dtype == torch.float32, (
+ f"a2_global_scale must be float32, got {a2_global_scale.dtype}"
+ )
+ assert w2_blockscale.dtype == torch.float8_e4m3fn, (
+ f"w2_blockscale must be float8_e4m3fn, got {w2_blockscale.dtype}"
+ )
+ assert w2_alpha.dtype == torch.float32, (
+ f"w2_alpha must be float32, got {w2_alpha.dtype}"
+ )
+
+ # === Assertions on shapes ===
+ n = w2.shape[-1] * 2 # intermediate dimension
+ num_experts, m, k = hidden_states.shape
+
+ assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}"
+ assert w1.shape[-1] * 2 == k, (
+ f"w1 last dim * 2 must equal k, got {w1.shape[-1]} vs k={k}"
+ )
+ assert w2.shape[-2:] == (
+ k,
+ n // 2,
+ ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n // 2)}"
+
+ assert input_global_scale.shape == (num_experts,), (
+ f"input_global_scale must be (l,), got {input_global_scale.shape}"
+ )
+ assert w1_alpha.shape == (num_experts,), (
+ f"w1_alpha must be (l,), got {w1_alpha.shape}"
+ )
+ assert a2_global_scale.shape == (num_experts,), (
+ f"a2_global_scale must be (l,), got {a2_global_scale.shape}"
+ )
+ assert w2_alpha.shape == (num_experts,), (
+ f"w2_alpha must be (l,), got {w2_alpha.shape}"
+ )
+
+ aq, aq_sf = scaled_fp4_grouped_quantize(
+ hidden_states,
+ masked_m,
+ input_global_scale,
+ )
+
+ workspace = workspace.permute(1, 2, 0) # requirement of kernel
+ sf_vec_size = 16
+ assert aq_sf.dtype == torch.float8_e4m3fn
+ assert aq.dtype == torch.uint8
+ ab_dtype = "float4_e2m1fn"
+ sf_dtype = "float8_e4m3fn"
+
+ c_dtype = get_cute_dtype(hidden_states)
+
+ # Gemm1
+ flashinfer_cutedsl_grouped_gemm_nt_masked(
+ (aq, aq_sf),
+ (w1.permute(1, 2, 0), w1_blockscale),
+ workspace,
+ masked_m,
+ ab_dtype=ab_dtype,
+ sf_dtype=sf_dtype,
+ c_dtype=c_dtype,
+ sf_vec_size=sf_vec_size,
+ alpha=w1_alpha.view(1, 1, num_experts),
+ alpha_dtype=get_cute_dtype(w1_alpha),
+ ) # in logical [m, n, l]
+
+ # SILU and quantization
+ diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize(
+ workspace.permute(2, 0, 1),
+ masked_m,
+ a2_global_scale,
+ )
+
+ # Gemm2
+ out = out.permute(1, 2, 0) # requirement of kernel
+ flashinfer_cutedsl_grouped_gemm_nt_masked(
+ (diq, diq_sf),
+ (w2.permute(1, 2, 0), w2_blockscale),
+ out,
+ masked_m,
+ ab_dtype=ab_dtype,
+ sf_dtype=sf_dtype,
+ c_dtype=c_dtype,
+ sf_vec_size=sf_vec_size,
+ alpha=w2_alpha.view(1, 1, num_experts),
+ alpha_dtype=get_cute_dtype(w2_alpha),
+ ) # in logical [m, k, l]
+ out = out.permute(2, 0, 1)
+
+
+def flashinfer_cutedsl_moe_fp4(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ quant_config: FusedMoEQuantConfig,
+ inplace: bool = False,
+ activation: str = "silu",
+ global_num_experts: int = -1,
+ expert_map: torch.Tensor | None = None,
+ apply_router_weight_on_input: bool = False,
+) -> torch.Tensor:
+ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
+ create_flashinfer_prepare_finalize,
+ )
+
+ fused_experts = mk.FusedMoEModularKernel(
+ create_flashinfer_prepare_finalize(use_dp=False), # could be swapped later
+ FlashInferCuteDSLExperts(
+ out_dtype=hidden_states.dtype,
+ quant_config=quant_config,
+ ),
+ )
+
+ return fused_experts(
+ hidden_states=hidden_states,
+ w1=w1,
+ w2=w2,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ inplace=inplace,
+ activation=activation,
+ global_num_experts=global_num_experts,
+ expert_map=expert_map,
+ apply_router_weight_on_input=apply_router_weight_on_input,
+ )
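A minimal shape-bookkeeping sketch of the masked MoE flow in flashinfer_cutedsl_moe_masked above, using hypothetical sizes (l experts, m padded tokens per expert, k hidden size, n intermediate size); only the first masked_m[i] rows of expert i are valid at each stage:

l, m, k, n = 8, 64, 1024, 2048               # hypothetical problem sizes
hidden_states = (l, m, k)                    # bf16 activations, padded per expert
aq = (l, m, k // 2)                          # fp4 activations, two values per byte
w1 = (l, 2 * n, k // 2)                      # fused gate/up weights, fp4 packed
gateup_workspace = (l, m, 2 * n)             # GEMM1 output buffer
diq = (l, m, n // 2)                         # silu(gate) * up, re-quantized to fp4
w2 = (l, k, n // 2)                          # down-projection weights, fp4 packed
out = (l, m, k)                              # final output written in place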
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 38ab7cd4f115c..f684c17452a9b 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1468,7 +1468,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
gemm1_weight = layer.w13_weight.data
gemm1_weight_scale = layer.w13_weight_scale.data
- if self.allow_flashinfer:
+ if (
+ self.allow_flashinfer
+ and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+ ):
gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
gemm1_weight, gemm1_weight_scale, dim=-2
)
@@ -1746,17 +1749,26 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
workspace=layer.workspace,
)
- elif (
- self.allow_flashinfer
- and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
- ):
- from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
- flashinfer_cutlass_moe_fp4,
+ elif self.allow_flashinfer:
+ assert self.flashinfer_moe_backend in (
+ FlashinferMoeBackend.CUTLASS,
+ FlashinferMoeBackend.CUTEDSL,
)
+ if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
+ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
+ flashinfer_cutlass_moe_fp4,
+ )
+
+ flashinfer_fn_moe_fp4 = flashinfer_cutlass_moe_fp4
+ else:
+ from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( # noqa: E501
+ flashinfer_cutedsl_moe_fp4,
+ )
+
+ flashinfer_fn_moe_fp4 = flashinfer_cutedsl_moe_fp4
assert self.moe_quant_config is not None
-
- return flashinfer_cutlass_moe_fp4(
+ return flashinfer_fn_moe_fp4(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index fdf330329e20c..36e8599dd9484 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -10,6 +10,9 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
)
+from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (
+ FlashInferCuteDSLExperts,
+)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
@@ -17,10 +20,14 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize im
create_flashinfer_prepare_finalize,
)
from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.flashinfer import (
+ has_flashinfer_cutedsl_grouped_gemm_nt_masked,
+ has_flashinfer_cutlass_fused_moe,
+)
__all__ = [
"is_flashinfer_fp4_cutlass_moe_available",
+ "is_flashinfer_fp4_cutedsl_moe_available",
"reorder_w1w3_to_w3w1",
"build_flashinfer_fp4_cutlass_moe_prepare_finalize",
]
@@ -36,6 +43,16 @@ def is_flashinfer_fp4_cutlass_moe_available() -> bool:
)
+def is_flashinfer_fp4_cutedsl_moe_available() -> bool:
+ """Return ``True`` when FlashInfer CUTEDSL NV-FP4 kernels can be used."""
+ return (
+ envs.VLLM_USE_FLASHINFER_MOE_FP4
+ and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
+ and current_platform.is_cuda()
+ and current_platform.is_device_capability(100)
+ )
+
+
def reorder_w1w3_to_w3w1(
weight: torch.Tensor, scale: torch.Tensor, dim: int = -2
) -> tuple[torch.Tensor, torch.Tensor]:
@@ -72,15 +89,21 @@ def select_nvfp4_gemm_impl(
"""Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers"""
if allow_flashinfer:
- return FlashInferExperts(
- out_dtype=moe.in_dtype,
- quant_config=moe_quant_config,
- ep_rank=moe.moe_parallel_config.ep_rank,
- ep_size=moe.moe_parallel_config.ep_size,
- tp_rank=moe.moe_parallel_config.tp_rank,
- tp_size=moe.moe_parallel_config.tp_size,
- use_dp=moe.moe_parallel_config.dp_size > 1,
- )
+ if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm":
+ return FlashInferCuteDSLExperts(
+ out_dtype=moe.in_dtype,
+ quant_config=moe_quant_config,
+ )
+ elif envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput":
+ return FlashInferExperts(
+ out_dtype=moe.in_dtype,
+ quant_config=moe_quant_config,
+ ep_rank=moe.moe_parallel_config.ep_rank,
+ ep_size=moe.moe_parallel_config.ep_size,
+ tp_rank=moe.moe_parallel_config.tp_rank,
+ tp_size=moe.moe_parallel_config.tp_size,
+ use_dp=moe.moe_parallel_config.dp_size > 1,
+ )
# native cutlass experts currently don't support DP; TP case won't call this
raise ValueError(
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index f22e17945d1f6..7eba8359b92f6 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -25,6 +25,7 @@ logger = init_logger(__name__)
class FlashinferMoeBackend(Enum):
TENSORRT_LLM = "TensorRT-LLM"
CUTLASS = "CUTLASS"
+ CUTEDSL = "CUTEDSL"
def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
@@ -273,19 +274,21 @@ def flashinfer_cutlass_moe_fp8(
def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
- flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND
- # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations
- if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability(
- 90
- ):
- return FlashinferMoeBackend.CUTLASS
- elif flashinfer_moe_backend == "latency":
- return FlashinferMoeBackend.TENSORRT_LLM
+ backend_map = {
+ "throughput": FlashinferMoeBackend.CUTLASS,
+ "latency": FlashinferMoeBackend.TENSORRT_LLM,
+ "masked_gemm": FlashinferMoeBackend.CUTEDSL,
+ }
+
+ flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND
+ if flashinfer_moe_backend in backend_map:
+ return backend_map[flashinfer_moe_backend]
+ elif current_platform.is_device_capability(90):
+ return FlashinferMoeBackend.CUTLASS
- allowed_backends = ["throughput", "latency"]
raise ValueError(
- f"Unknown flashinfer moe backend: {flashinfer_moe_backend}"
- f" expected one of {allowed_backends}"
+ f"Unknown flashinfer moe backend: {flashinfer_moe_backend!r}. "
+ f"Expected one of {list(backend_map.keys())}."
)
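
For reference, a standalone sketch of the dispatch above (not part of the patch; the enum mirrors the diff, while `select_backend` and the capability argument are hypothetical stand-ins for the current_platform check):

from enum import Enum


class FlashinferMoeBackend(Enum):
    TENSORRT_LLM = "TensorRT-LLM"
    CUTLASS = "CUTLASS"
    CUTEDSL = "CUTEDSL"


def select_backend(env_value: str, device_capability: int) -> FlashinferMoeBackend:
    # Map the recognized VLLM_FLASHINFER_MOE_BACKEND values to backends.
    backend_map = {
        "throughput": FlashinferMoeBackend.CUTLASS,
        "latency": FlashinferMoeBackend.TENSORRT_LLM,
        "masked_gemm": FlashinferMoeBackend.CUTEDSL,
    }
    if env_value in backend_map:
        return backend_map[env_value]
    if device_capability == 90:
        # SM90 falls back to CUTLASS when no explicit backend is requested.
        return FlashinferMoeBackend.CUTLASS
    raise ValueError(
        f"Unknown flashinfer moe backend: {env_value!r}. "
        f"Expected one of {list(backend_map.keys())}."
    )


# e.g. select_backend("masked_gemm", 100) -> FlashinferMoeBackend.CUTEDSL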
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
index c3f26cc774118..44c5b027daf4f 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+ is_flashinfer_fp4_cutedsl_moe_available,
is_flashinfer_fp4_cutlass_moe_available,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
@@ -32,7 +33,10 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
"""Detect platform support for NV-FP4 fused-MoE path"""
cutlass_supported = cutlass_fp4_supported()
- allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available()
+ allow_flashinfer = cutlass_supported and (
+ is_flashinfer_fp4_cutlass_moe_available()
+ or is_flashinfer_fp4_cutedsl_moe_available()
+ )
if allow_flashinfer:
_logger.info_once(
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 1209d64901bf5..9f9976d52b4ae 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -114,7 +114,17 @@ flashinfer_trtllm_fp8_per_tensor_scale_moe = _lazy_import_wrapper(
flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
"flashinfer.fused_moe", "cutlass_fused_moe"
)
+flashinfer_cutedsl_grouped_gemm_nt_masked = _lazy_import_wrapper(
+ "flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"
+)
flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
+nvfp4_batched_quantize = _lazy_import_wrapper("flashinfer", "nvfp4_batched_quantize")
+silu_and_mul_scaled_nvfp4_experts_quantize = _lazy_import_wrapper(
+ "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"
+)
+scaled_fp4_grouped_quantize = _lazy_import_wrapper(
+ "flashinfer", "scaled_fp4_grouped_quantize"
+)
nvfp4_block_scale_interleave = _lazy_import_wrapper(
"flashinfer", "nvfp4_block_scale_interleave"
)
@@ -166,6 +176,14 @@ def has_flashinfer_moe() -> bool:
)
+@functools.cache
+def has_flashinfer_cutedsl() -> bool:
+ """Return ``True`` if FlashInfer cutedsl module is available."""
+ return (
+ has_flashinfer() and importlib.util.find_spec("flashinfer.cute_dsl") is not None
+ )
+
+
@functools.cache
def has_flashinfer_cutlass_fused_moe() -> bool:
"""Return `True` if FlashInfer CUTLASS fused MoE is available."""
@@ -187,6 +205,26 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
return True
+@functools.cache
+def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool:
+    """Return ``True`` if FlashInfer CuteDSL grouped_gemm_nt_masked is available."""
+ if not has_flashinfer_cutedsl():
+ return False
+
+ # Check if all required functions are available
+ required_functions = [
+ ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"),
+ ("flashinfer", "scaled_fp4_grouped_quantize"),
+ ("flashinfer", "silu_and_scaled_nvfp4_experts_quantize"),
+ ]
+
+ for module_name, attr_name in required_functions:
+ mod = _get_submodule(module_name)
+ if not mod or not hasattr(mod, attr_name):
+ return False
+ return True
+
+
@functools.cache
def has_nvidia_artifactory() -> bool:
"""Return `True` if NVIDIA's artifactory is accessible.
@@ -472,7 +510,10 @@ __all__ = [
"has_flashinfer",
"flashinfer_trtllm_fp8_block_scale_moe",
"flashinfer_cutlass_fused_moe",
+ "flashinfer_cutedsl_grouped_gemm_nt_masked",
"flashinfer_fp4_quantize",
+ "silu_and_mul_scaled_nvfp4_experts_quantize",
+ "scaled_fp4_grouped_quantize",
"nvfp4_block_scale_interleave",
"trtllm_fp4_block_scale_moe",
"autotune",
@@ -480,6 +521,7 @@ __all__ = [
"has_flashinfer_comm",
"has_flashinfer_all2all",
"has_flashinfer_cutlass_fused_moe",
+ "has_flashinfer_cutedsl_grouped_gemm_nt_masked",
"has_nvidia_artifactory",
"supports_trtllm_attention",
"can_use_trtllm_attention",
From 88f5b19f0bc681c016eaaa17502d3bb4e2b59b51 Mon Sep 17 00:00:00 2001
From: Yongye Zhu
Date: Wed, 19 Nov 2025 16:30:04 -0500
Subject: [PATCH 036/249] [DeepSeek] Fix DeepSeek V3.2 Rope Embedding (#28968)
Signed-off-by: Yongye Zhu
---
vllm/model_executor/layers/mla.py | 6 +++++-
vllm/model_executor/models/deepseek_v2.py | 14 ++++++++++++--
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index c4c44b83ae6bf..6ebfa47a9dc3f 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -24,6 +24,7 @@ class MLAModules:
q_b_proj: torch.nn.Module | None
q_proj: torch.nn.Module | None
indexer: torch.nn.Module | None
+ indexer_rotary_emb: torch.nn.Module | None
is_sparse: bool
topk_indices_buffer: torch.Tensor | None
@@ -80,6 +81,7 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
self.rotary_emb = mla_modules.rotary_emb
self.o_proj = mla_modules.o_proj
self.indexer = mla_modules.indexer
+ self.indexer_rope_emb = mla_modules.indexer_rotary_emb
self.is_sparse = mla_modules.is_sparse
if self.indexer is not None:
@@ -153,7 +155,9 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
)
if self.indexer and self.is_sparse:
- _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb)
+ _topk_indices = self.indexer(
+ hidden_states, q_c, positions, self.indexer_rope_emb
+ )
attn_out = self.mla_attn(
q,
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 6675b2133f386..c0ff621d84085 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -837,8 +837,8 @@ class Indexer(nn.Module):
)
q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
- q = torch.cat([q_pe, q_nope], dim=-1)
- k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1)
+ q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1)
+ k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1)
# we only quant q here since k quant is fused with cache insertion
q = q.view(-1, self.head_dim)
@@ -987,6 +987,14 @@ class DeepseekV2MLAAttention(nn.Module):
self.is_v32 = hasattr(config, "index_topk")
if self.is_v32:
+ self.indexer_rope_emb = get_rope(
+ qk_rope_head_dim,
+ rotary_dim=qk_rope_head_dim,
+ max_position=max_position_embeddings,
+ base=rope_theta,
+ rope_scaling=rope_scaling,
+ is_neox_style=True,
+ )
self.indexer = Indexer(
vllm_config,
config,
@@ -998,6 +1006,7 @@ class DeepseekV2MLAAttention(nn.Module):
f"{prefix}.indexer",
)
else:
+ self.indexer_rope_emb = None
self.indexer = None
mla_modules = MLAModules(
@@ -1015,6 +1024,7 @@ class DeepseekV2MLAAttention(nn.Module):
q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
q_proj=self.q_proj if self.q_lora_rank is None else None,
indexer=self.indexer,
+ indexer_rotary_emb=self.indexer_rope_emb,
is_sparse=self.is_v32,
topk_indices_buffer=topk_indices_buffer,
)
From 22e44ad589d951f440ef98141a2a6f9df97f6873 Mon Sep 17 00:00:00 2001
From: Micah Williamson
Date: Wed, 19 Nov 2025 15:31:33 -0600
Subject: [PATCH 037/249] [ROCm][CI] Fix Weight Loading With Multiple GPU Tests
on ROCm (#28984)
Signed-off-by: Micah Williamson
---
.buildkite/test-amd.yaml | 5 ++---
tests/weight_loading/models-amd.txt | 3 +++
tests/weight_loading/models-large-amd.txt | 3 +++
3 files changed, 8 insertions(+), 3 deletions(-)
create mode 100644 tests/weight_loading/models-amd.txt
create mode 100644 tests/weight_loading/models-large-amd.txt
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 0049f35403409..37c6bd4276722 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1323,7 +1323,7 @@ steps:
- vllm/
- tests/weight_loading
commands:
- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
@@ -1331,13 +1331,12 @@ steps:
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
- gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
diff --git a/tests/weight_loading/models-amd.txt b/tests/weight_loading/models-amd.txt
new file mode 100644
index 0000000000000..e31e904c08af4
--- /dev/null
+++ b/tests/weight_loading/models-amd.txt
@@ -0,0 +1,3 @@
+fp8, amd/Meta-Llama-3.1-8B-Instruct-FP8-KV, main
+None, amd/Llama-3.2-1B-Instruct-FP8-KV, main
+fp8, amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV, main
diff --git a/tests/weight_loading/models-large-amd.txt b/tests/weight_loading/models-large-amd.txt
new file mode 100644
index 0000000000000..b6f5b4b16b37f
--- /dev/null
+++ b/tests/weight_loading/models-large-amd.txt
@@ -0,0 +1,3 @@
+fp8, amd/Meta-Llama-3.1-70B-Instruct-FP8-KV, main
+None, microsoft/phi-4, main
+fp8, amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV, main
From 8f4f77a7275ecac594f84bdb41b67c95cf3eb26d Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Wed, 19 Nov 2025 16:43:54 -0500
Subject: [PATCH 038/249] [BugFix] Fix false assertion with
spec-decode=[2,4,..] and TP>2 (#29036)
Signed-off-by: Lucas Wilkinson
---
vllm/config/compilation.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index ca01cb3fb55d5..1c3ef502f0f45 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -921,7 +921,7 @@ class CompilationConfig:
self, uniform_decode_query_len: int, tensor_parallel_size: int
):
multiple_of = uniform_decode_query_len
- if tensor_parallel_size > 1:
+ if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism:
multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
if (
multiple_of % uniform_decode_query_len != 0
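
A small numeric illustration of the fix (values are hypothetical, mirroring the spec-decode with TP>2 case from the title; `compute_multiple_of` is a stand-in for the config method above):

def compute_multiple_of(
    uniform_decode_query_len: int,
    tensor_parallel_size: int,
    enable_sequence_parallelism: bool,
) -> int:
    multiple_of = uniform_decode_query_len
    # Only round up to the TP size when sequence parallelism actually pads
    # the batch to a multiple of TP.
    if tensor_parallel_size > 1 and enable_sequence_parallelism:
        multiple_of = max(uniform_decode_query_len, tensor_parallel_size)
    return multiple_of


# e.g. uniform_decode_query_len=3 with spec decode, TP=4, SP disabled:
assert compute_multiple_of(3, 4, False) == 3  # old code returned 4, and 4 % 3 != 0
assert compute_multiple_of(3, 4, True) == 4   # SP still rounds up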
From cb0a7b4bea26657da989562a10055b7d0b59fd3a Mon Sep 17 00:00:00 2001
From: Max Hu
Date: Wed, 19 Nov 2025 16:54:15 -0500
Subject: [PATCH 039/249] [Bugfix] Move flashinfer kernel check into
```__init__``` function of ```FusedMoE``` (#29018)
Signed-off-by: Max Hu
---
vllm/model_executor/layers/fused_moe/layer.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 7b15e63e9e350..be1910266c878 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -574,6 +574,9 @@ class FusedMoE(CustomOp):
is_act_and_mul=is_act_and_mul,
is_lora_enabled=vllm_config.lora_config is not None,
)
+ self.moe_config_use_flashinfer_cutlass_kernels = (
+ self.moe_config.use_flashinfer_cutlass_kernels
+ )
self.quant_config = quant_config
@@ -728,7 +731,7 @@ class FusedMoE(CustomOp):
return (
self.moe_quant_config is not None
and self.moe_quant_config.quant_dtype == "nvfp4"
- and self.moe_config.use_flashinfer_cutlass_kernels
+ and self.moe_config_use_flashinfer_cutlass_kernels
)
@property
From 0075bfffd4201d1377f0d048848f82911e917639 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:22:43 -0500
Subject: [PATCH 040/249] [CI] Fix precommit `rope_theta` issue (#29040)
Signed-off-by: yewentao256
---
vllm/model_executor/models/deepseek_v2.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index c0ff621d84085..c50fc327e7608 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -991,8 +991,7 @@ class DeepseekV2MLAAttention(nn.Module):
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
- base=rope_theta,
- rope_scaling=rope_scaling,
+ rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.indexer = Indexer(
From 8e38e998298364b0a94cddf7ccc59d8466c2396a Mon Sep 17 00:00:00 2001
From: JartX
Date: Thu, 20 Nov 2025 00:30:08 +0100
Subject: [PATCH 041/249] [Feature] EPLB on Qwen3VLMoe and
CompressedTensorsWNA16MoEMethod (#28849)
---
.../compressed_tensors_moe.py | 27 +++++++-
vllm/model_executor/models/qwen3_vl_moe.py | 62 +++++++++++++++++--
2 files changed, 82 insertions(+), 7 deletions(-)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 22b3c477f420f..fa254030a271a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1921,9 +1921,20 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
- )
+ if expert_load_view is None:
+                raise ValueError("enable_eplb=True requires expert_load_view != None")
+ if logical_to_physical_map is None:
+ raise ValueError(
+                    "enable_eplb=True requires logical_to_physical_map != None"
+ )
+ if logical_replica_count is None:
+ raise ValueError(
+                    "enable_eplb=True requires logical_replica_count != None"
+ )
+ if not isinstance(layer, FusedMoE):
+ raise TypeError(
+                    "EPLB is only supported when `layer` is an instance of FusedMoE."
+ )
from vllm.model_executor.layers.fused_moe import fused_experts
@@ -1940,6 +1951,12 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
+ num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
+ enable_eplb=enable_eplb,
+ expert_map=expert_map,
+ expert_load_view=expert_load_view,
+ logical_to_physical_map=logical_to_physical_map,
+ logical_replica_count=logical_replica_count,
)
return fused_experts(
@@ -1956,6 +1973,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
quant_config=self.moe_quant_config,
)
+ @property
+ def supports_eplb(self) -> bool:
+ return True
+
class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
"""
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index 5c3205faf9c2f..e2c129120b1a5 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -15,7 +15,7 @@
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
-# http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
@@ -29,7 +29,9 @@ from collections.abc import Callable, Iterable
from itertools import islice
import torch
-from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import (
+ Qwen3VLMoeConfig,
+)
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
@@ -44,7 +46,12 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
-from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel
+from .interfaces import MixtureOfExperts
+from .qwen3_moe import (
+ Qwen3MoeForCausalLM,
+ Qwen3MoeModel,
+ Qwen3MoeSparseMoeBlock,
+)
from .qwen3_vl import (
Qwen3_VisionTransformer,
Qwen3VLDummyInputsBuilder,
@@ -344,12 +351,56 @@ class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM):
)
+class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts):
+ def update_physical_experts_metadata(
+ self,
+ num_physical_experts: int,
+ num_local_physical_experts: int,
+ ) -> None:
+ assert self.num_local_physical_experts == num_local_physical_experts
+ self.num_physical_experts = num_physical_experts
+ self.num_local_physical_experts = num_local_physical_experts
+ self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+ for layer in self.language_model.model.layers:
+ if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
+ moe = layer.mlp
+ moe.n_local_physical_experts = num_local_physical_experts
+ moe.n_physical_experts = num_physical_experts
+ moe.n_redundant_experts = self.num_redundant_experts
+ moe.experts.update_expert_map()
+
+ def set_moe_parameters(self):
+ self.expert_weights = []
+
+ self.moe_layers = []
+ example_moe = None
+ for layer in self.language_model.model.layers:
+ if hasattr(layer, "mlp") and isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
+ example_moe = layer.mlp
+ self.moe_layers.append(layer.mlp.experts)
+
+ if example_moe is None:
+ raise RuntimeError("No Qwen3Moe layer found in the language_model.")
+
+ # Set MoE hyperparameters
+ self.num_moe_layers = len(self.moe_layers)
+ self.num_expert_groups = 1
+ self.num_shared_experts = 0
+ self.num_logical_experts = example_moe.n_logical_experts
+ self.num_physical_experts = example_moe.n_physical_experts
+ self.num_local_physical_experts = example_moe.n_local_physical_experts
+ self.num_routed_experts = example_moe.n_routed_experts
+ self.num_redundant_experts = example_moe.n_redundant_experts
+
+
@MULTIMODAL_REGISTRY.register_processor(
Qwen3VLMultiModalProcessor,
info=Qwen3VLMoeProcessingInfo,
dummy_inputs=Qwen3VLDummyInputsBuilder,
)
-class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
+class Qwen3VLMoeForConditionalGeneration(
+ Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts
+):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
@@ -413,3 +464,6 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
self.deepstack_input_embeds = None
self.visual_dim = config.vision_config.out_hidden_size
self.multiscale_dim = self.visual_dim * self.deepstack_num_level
+
+ # Set MoE hyperparameters
+ self.set_moe_parameters()
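
As a quick sanity check of the bookkeeping wired up in set_moe_parameters and update_physical_experts_metadata, a made-up expert layout (all numbers hypothetical):

# 64 logical experts replicated to 72 physical experts across 8 EP ranks.
num_logical_experts = 64
num_physical_experts = 72
ep_size = 8

num_local_physical_experts = num_physical_experts // ep_size
num_redundant_experts = num_physical_experts - num_logical_experts

assert num_local_physical_experts == 9
assert num_redundant_experts == 8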
From 3aaa94ac99f4b295ba95f14b4968620b2127044f Mon Sep 17 00:00:00 2001
From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com>
Date: Wed, 19 Nov 2025 18:47:13 -0500
Subject: [PATCH 042/249] [Performance] Reduce DeepGEMM N dim restriction from
128 to 64 multiplier (#28687)
Signed-off-by: Alexander Matveev
Signed-off-by: mgoin
Co-authored-by: mgoin
---
.buildkite/test-pipeline.yaml | 20 ++++++++++++++++++++
tests/kernels/quantization/test_block_fp8.py | 11 +++++++----
vllm/utils/deep_gemm.py | 11 +++++++++--
3 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5309581d8e81f..71249a9543c7c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -550,6 +550,26 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ optional: true
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s tests/kernels/moe/test_deepgemm.py
+ - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index e9973c1fcc15e..d0e4f6554a91f 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -22,6 +22,7 @@ from vllm.utils.deep_gemm import (
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8,
+ should_use_deepgemm_for_fp8_linear,
)
from vllm.utils.import_utils import has_deep_gemm
@@ -157,10 +158,6 @@ def test_w8a8_block_fp8_cutlass_matmul():
@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.")
@torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
- # only aligned sizes
- if M % 4 != 0 or K % 128 != 0 or N % 64 != 0:
- pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
-
torch.manual_seed(seed)
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max = fp8_info.max
@@ -168,6 +165,12 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+ # only aligned sizes are supported by deepgemm
+ if not should_use_deepgemm_for_fp8_linear(
+ output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True
+ ):
+ pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
+
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size)
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index b5ab37534dd78..6b0a383a0e28c 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -365,11 +365,18 @@ def should_use_deepgemm_for_fp8_linear(
):
if supports_deep_gemm is None:
supports_deep_gemm = is_deep_gemm_supported()
+
+    # Verify DeepGEMM N/K dimension requirements
+    # NOTE: Also synchronized with the test_w8a8_block_fp8_deep_gemm_matmul
+    # test inside kernels/quantization/test_block_fp8.py
+ N_MULTIPLE = 64
+ K_MULTIPLE = 128
+
return (
supports_deep_gemm
and output_dtype == torch.bfloat16
- and weight.shape[0] % 128 == 0
- and weight.shape[1] % 128 == 0
+ and weight.shape[0] % N_MULTIPLE == 0
+ and weight.shape[1] % K_MULTIPLE == 0
)
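
A minimal sketch of the relaxed shape gate (weight laid out as [N, K], matching the check above; sizes below are examples only):

import torch

N_MULTIPLE = 64   # was effectively 128 before this patch
K_MULTIPLE = 128


def deepgemm_shape_ok(weight: torch.Tensor) -> bool:
    n, k = weight.shape
    return n % N_MULTIPLE == 0 and k % K_MULTIPLE == 0


assert deepgemm_shape_ok(torch.empty(192, 256))      # newly allowed: N=192
assert not deepgemm_shape_ok(torch.empty(192, 200))  # K must stay a multiple of 128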
From 5031cd5d55ad99e8f9b31dd0020a06b346f6e493 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 19 Nov 2025 18:53:15 -0500
Subject: [PATCH 043/249] [Refactor] Optimize `select_experts` (#28069)
Signed-off-by: yewentao256
---
vllm/model_executor/layers/fused_moe/fused_moe.py | 5 -----
vllm/model_executor/layers/fused_moe/layer.py | 11 ++++-------
vllm/model_executor/layers/quantization/modelopt.py | 2 +-
vllm/model_executor/models/longcat_flash.py | 2 +-
vllm/model_executor/models/openpangu.py | 2 +-
5 files changed, 7 insertions(+), 15 deletions(-)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 2e042d85fcfcf..f44328418f1bc 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1246,7 +1246,6 @@ def eplb_map_to_physical_and_record(
expert_load_view: torch.Tensor,
logical_to_physical_map: torch.Tensor,
logical_replica_count: torch.Tensor,
- indices_type: torch.dtype | None = None,
) -> torch.Tensor:
"""
Map the logical expert ids to physical expert ids
@@ -1260,7 +1259,6 @@ def eplb_map_to_physical_and_record(
expert_load_view: The expert load view.
logical_to_physical_map: The logical to physical map.
logical_replica_count: The logical replica count.
- indices_type: The indices type.
Returns:
The physical expert ids.
@@ -1310,9 +1308,6 @@ def eplb_map_to_physical_and_record(
index=topk_ids_flatten.long(),
src=torch.ones_like(topk_ids_flatten).to(expert_load_view),
)
-
- if indices_type is not None:
- topk_ids = topk_ids.to(dtype=indices_type)
return topk_ids
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index be1910266c878..d9525a7439c3e 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -68,7 +68,6 @@ else:
expert_load_view: torch.Tensor,
logical_to_physical_map: torch.Tensor,
logical_replica_count: torch.Tensor,
- indices_type: torch.dtype | None,
) -> torch.Tensor:
# CPU fallback: no EPLB so just return as is
return topk_ids
@@ -1509,8 +1508,6 @@ class FusedMoE(CustomOp):
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
)
- if indices_type is not None:
- topk_ids = topk_ids.to(dtype=indices_type)
elif e_score_correction_bias is not None:
topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
@@ -1519,7 +1516,7 @@ class FusedMoE(CustomOp):
topk=top_k,
renormalize=renormalize,
)
- if routed_scaling_factor is not None:
+ if routed_scaling_factor != 1.0:
topk_weights *= routed_scaling_factor
elif custom_routing_function is None:
topk_weights, topk_ids, token_expert_indices = fused_topk(
@@ -1536,8 +1533,6 @@ class FusedMoE(CustomOp):
topk=top_k,
renormalize=renormalize,
)
- if indices_type is not None:
- topk_ids = topk_ids.to(dtype=indices_type)
if enable_eplb:
assert expert_load_view is not None
@@ -1549,9 +1544,11 @@ class FusedMoE(CustomOp):
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
- indices_type=indices_type,
)
+ if (indices_type is not None) and topk_ids.dtype != indices_type:
+ topk_ids = topk_ids.to(dtype=indices_type)
+
assert topk_ids.dtype == indices_type or indices_type is None
# Compute zero expert result if needed
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index f684c17452a9b..dedab33c1bdb7 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1706,7 +1706,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
intermediate_size=layer.intermediate_size_per_partition,
local_expert_offset=layer.ep_rank * layer.local_num_experts,
local_num_experts=layer.local_num_experts,
- routed_scaling_factor=None,
+ routed_scaling_factor=1.0,
tile_tokens_dim=None,
routing_method_type=routing_method_type,
do_finalize=True,
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index fafe97cd2be7e..c5441283f9711 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -118,7 +118,7 @@ class FlashConfig(PretrainedConfig):
router_dtype="float32",
router_bias=False,
topk_method=None,
- routed_scaling_factor=None,
+ routed_scaling_factor=1.0,
zero_expert_num=0,
zero_expert_type=None,
nextn_use_scmoe=False,
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index f814cdfec5a22..4124a181a14c2 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -625,7 +625,7 @@ class OpenPanguDecoderLayer(nn.Module):
bias=getattr(config, "mlp_bias", False),
prefix=f"{prefix}.mlp",
)
- self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None)
+ self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
self.num_hidden_layers = config.num_hidden_layers
self.first_k_dense_replace = getattr(
config, "first_k_dense_replace", self.num_hidden_layers
From 537cc635c77ac63f643c5289137debdd8f9591ac Mon Sep 17 00:00:00 2001
From: Jialin Ouyang
Date: Wed, 19 Nov 2025 16:10:22 -0800
Subject: [PATCH 044/249] [GC Debugger] Simplify and improve GC Debugger Utils
(#29029)
Signed-off-by: Jialin Ouyang
---
vllm/utils/gc_utils.py | 7 ++++---
vllm/v1/engine/core.py | 5 ++---
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
index 160ac9ac263a9..3436e450a269f 100644
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@@ -68,9 +68,10 @@ class GCDebugger:
# Before GC started, record GC start time
# and top collected objects
self.start_time_ns = time.monotonic_ns()
- self.gc_top_collected_objects = _compute_top_gc_collected_objects(
- gc.get_objects(generation), self.config.top_objects
- )
+ if (top_objects := self.config.top_objects) > 0:
+ self.gc_top_collected_objects = _compute_top_gc_collected_objects(
+ gc.get_objects(generation), top_objects
+ )
elif phase == "stop":
# After GC finished, Record GC elapsed time and
# optionally top collected objects
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 6be19894d332a..8657a95b5e6e7 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -206,6 +206,8 @@ class EngineCore:
# Mark the startup heap as static so that it's ignored by GC.
# Reduces pause times of oldest generation collections.
freeze_gc_heap()
+        # If enabled, attach the GC debugger after the static variable freeze.
+ maybe_attach_gc_debug_callback()
def _initialize_kv_caches(
self, vllm_config: VllmConfig
@@ -645,9 +647,6 @@ class EngineCoreProc(EngineCore):
assert addresses.coordinator_input is not None
logger.info("Waiting for READY message from DP Coordinator...")
- # If enable, attach GC debugger after static variable freeze.
- maybe_attach_gc_debug_callback()
-
# Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point)
enable_envs_cache()
From 9ccef8e333ccd988a587990740405503e76c8c20 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Nov 2025 16:26:04 -0800
Subject: [PATCH 045/249] [Misc] Colorize logs (#29017)
Signed-off-by: Nick Hill
---
tests/test_logger.py | 94 ++++++++++++++++++---------------
vllm/envs.py | 9 ++++
vllm/logger.py | 51 ++++++++++++------
vllm/logging_utils/__init__.py | 3 +-
vllm/logging_utils/formatter.py | 50 ++++++++++++++++++
vllm/utils/system_utils.py | 7 ++-
6 files changed, 152 insertions(+), 62 deletions(-)
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 01672358902f9..8900e9c2a1e69 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -49,10 +49,13 @@ def test_trace_function_call():
os.remove(path)
-def test_default_vllm_root_logger_configuration():
+def test_default_vllm_root_logger_configuration(monkeypatch):
"""This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and
VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default
behavior is activated."""
+ monkeypatch.setenv("VLLM_LOGGING_COLOR", "0")
+ _configure_vllm_root_logger()
+
logger = logging.getLogger("vllm")
assert logger.level == logging.DEBUG
assert not logger.propagate
@@ -70,12 +73,13 @@ def test_default_vllm_root_logger_configuration():
assert formatter.datefmt == _DATE_FORMAT
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
-@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None)
-def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger():
+def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(monkeypatch):
"""This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and
VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default
behavior is activated."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1")
+ monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False)
+
root_logger = logging.getLogger("vllm")
root_handler = root_logger.handlers[0]
@@ -99,49 +103,50 @@ def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger():
assert log_record.levelno == logging.INFO
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
-@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None)
-def test_logger_configuring_can_be_disabled():
+def test_logger_configuring_can_be_disabled(monkeypatch):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however mocks are used to ensure no changes in behavior or
configuration occur."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0")
+ monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False)
with patch("vllm.logger.dictConfig") as dict_config_mock:
_configure_vllm_root_logger()
dict_config_mock.assert_not_called()
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
-@patch(
- "vllm.logger.VLLM_LOGGING_CONFIG_PATH",
- "/if/there/is/a/file/here/then/you/did/this/to/yourself.json",
-)
-def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist():
+def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(monkeypatch):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however it fails before any change in behavior or
configuration occurs."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1")
+ monkeypatch.setenv(
+ "VLLM_LOGGING_CONFIG_PATH",
+ "/if/there/is/a/file/here/then/you/did/this/to/yourself.json",
+ )
+
with pytest.raises(RuntimeError) as ex_info:
_configure_vllm_root_logger()
assert ex_info.type == RuntimeError # noqa: E721
assert "File does not exist" in str(ex_info)
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
-def test_an_error_is_raised_when_custom_logging_config_is_invalid_json():
+def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(monkeypatch):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however it fails before any change in behavior or
configuration occurs."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1")
+
with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
logging_config_file.write("---\nloggers: []\nversion: 1")
logging_config_file.flush()
- with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
- with pytest.raises(JSONDecodeError) as ex_info:
- _configure_vllm_root_logger()
- assert ex_info.type == JSONDecodeError
- assert "Expecting value" in str(ex_info)
+ monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name)
+ with pytest.raises(JSONDecodeError) as ex_info:
+ _configure_vllm_root_logger()
+ assert ex_info.type == JSONDecodeError
+ assert "Expecting value" in str(ex_info)
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
@pytest.mark.parametrize(
"unexpected_config",
(
@@ -151,26 +156,30 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json():
),
)
def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json(
+ monkeypatch,
unexpected_config: Any,
):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however it fails before any change in behavior or
configuration occurs."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1")
+
with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
logging_config_file.write(json.dumps(unexpected_config))
logging_config_file.flush()
- with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
- with pytest.raises(ValueError) as ex_info:
- _configure_vllm_root_logger()
- assert ex_info.type == ValueError # noqa: E721
- assert "Invalid logging config. Expected dict, got" in str(ex_info)
+ monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name)
+ with pytest.raises(ValueError) as ex_info:
+ _configure_vllm_root_logger()
+ assert ex_info.type == ValueError # noqa: E721
+ assert "Invalid logging config. Expected dict, got" in str(ex_info)
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1)
-def test_custom_logging_config_is_parsed_and_used_when_provided():
+def test_custom_logging_config_is_parsed_and_used_when_provided(monkeypatch):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however mocks are used to ensure no changes in behavior or
configuration occur."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1")
+
valid_logging_config = {
"loggers": {
"vllm.test_logger.logger": {
@@ -183,19 +192,18 @@ def test_custom_logging_config_is_parsed_and_used_when_provided():
with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
logging_config_file.write(json.dumps(valid_logging_config))
logging_config_file.flush()
- with (
- patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name),
- patch("vllm.logger.dictConfig") as dict_config_mock,
- ):
+ monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name)
+ with patch("vllm.logger.dictConfig") as dict_config_mock:
_configure_vllm_root_logger()
dict_config_mock.assert_called_with(valid_logging_config)
-@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
-def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
+def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(monkeypatch):
"""This test calls _configure_vllm_root_logger again to test custom logging
config behavior, however mocks are used to ensure no changes in behavior or
configuration occur."""
+ monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0")
+
valid_logging_config = {
"loggers": {
"vllm.test_logger.logger": {
@@ -207,15 +215,15 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file:
logging_config_file.write(json.dumps(valid_logging_config))
logging_config_file.flush()
- with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name):
- with pytest.raises(RuntimeError) as ex_info:
- _configure_vllm_root_logger()
- assert ex_info.type is RuntimeError
- expected_message_snippet = (
- "VLLM_CONFIGURE_LOGGING evaluated to false, but "
- "VLLM_LOGGING_CONFIG_PATH was given."
- )
- assert expected_message_snippet in str(ex_info)
+ monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name)
+ with pytest.raises(RuntimeError) as ex_info:
+ _configure_vllm_root_logger()
+ assert ex_info.type is RuntimeError
+ expected_message_snippet = (
+ "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+ "VLLM_LOGGING_CONFIG_PATH was given."
+ )
+ assert expected_message_snippet in str(ex_info)
# Remember! The root logger is assumed to have been configured as
# though VLLM_CONFIGURE_LOGGING=1 and VLLM_LOGGING_CONFIG_PATH=None.
diff --git a/vllm/envs.py b/vllm/envs.py
index 1ff620af57229..614bc94b978bd 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -42,6 +42,8 @@ if TYPE_CHECKING:
VLLM_LOGGING_PREFIX: str = ""
VLLM_LOGGING_STREAM: str = "ext://sys.stdout"
VLLM_LOGGING_CONFIG_PATH: str | None = None
+ VLLM_LOGGING_COLOR: str = "auto"
+ NO_COLOR: bool = False
VLLM_LOG_STATS_INTERVAL: float = 10.0
VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: str | None = None
@@ -616,6 +618,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"),
# if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
"VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
+    # Controls colored logging output. Options: "auto" (default, use colors when
+    # the output stream is a terminal), "1" (always use colors), "0" (never use colors)
+ "VLLM_LOGGING_COLOR": lambda: os.getenv("VLLM_LOGGING_COLOR", "auto"),
+ # Standard unix flag for disabling ANSI color codes
+ "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0",
# If set, vllm will log stats at this interval in seconds
# If not set, vllm will log stats every 10 seconds.
"VLLM_LOG_STATS_INTERVAL": lambda: val
@@ -1578,6 +1585,7 @@ def compile_factors() -> dict[str, object]:
"VLLM_LOGGING_PREFIX",
"VLLM_LOGGING_STREAM",
"VLLM_LOGGING_CONFIG_PATH",
+ "VLLM_LOGGING_COLOR",
"VLLM_LOG_STATS_INTERVAL",
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE",
"VLLM_TUNED_CONFIG_FOLDER",
@@ -1608,6 +1616,7 @@ def compile_factors() -> dict[str, object]:
"VLLM_TEST_FORCE_LOAD_FORMAT",
"LOCAL_RANK",
"CUDA_VISIBLE_DEVICES",
+ "NO_COLOR",
}
from vllm.config.utils import normalize_value
diff --git a/vllm/logger.py b/vllm/logger.py
index 9341008296843..772e36497b45e 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -17,18 +17,25 @@ from typing import Any, Literal, cast
import vllm.envs as envs
-VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
-VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
-VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
-VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
-VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM
-
_FORMAT = (
- f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+ f"{envs.VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
"[%(fileinfo)s:%(lineno)d] %(message)s"
)
_DATE_FORMAT = "%m-%d %H:%M:%S"
+
+def _use_color() -> bool:
+ if envs.NO_COLOR or envs.VLLM_LOGGING_COLOR == "0":
+ return False
+ if envs.VLLM_LOGGING_COLOR == "1":
+ return True
+ if envs.VLLM_LOGGING_STREAM == "ext://sys.stdout": # stdout
+ return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+ elif envs.VLLM_LOGGING_STREAM == "ext://sys.stderr": # stderr
+ return hasattr(sys.stderr, "isatty") and sys.stderr.isatty()
+ return False
+
+
DEFAULT_LOGGING_CONFIG = {
"formatters": {
"vllm": {
@@ -36,13 +43,19 @@ DEFAULT_LOGGING_CONFIG = {
"datefmt": _DATE_FORMAT,
"format": _FORMAT,
},
+ "vllm_color": {
+ "class": "vllm.logging_utils.ColoredFormatter",
+ "datefmt": _DATE_FORMAT,
+ "format": _FORMAT,
+ },
},
"handlers": {
"vllm": {
"class": "logging.StreamHandler",
- "formatter": "vllm",
- "level": VLLM_LOGGING_LEVEL,
- "stream": VLLM_LOGGING_STREAM,
+ # Choose formatter based on color setting.
+ "formatter": "vllm_color" if _use_color() else "vllm",
+ "level": envs.VLLM_LOGGING_LEVEL,
+ "stream": envs.VLLM_LOGGING_STREAM,
},
},
"loggers": {
@@ -144,7 +157,7 @@ _METHODS_TO_PATCH = {
def _configure_vllm_root_logger() -> None:
logging_config = dict[str, Any]()
- if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+ if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
raise RuntimeError(
"VLLM_CONFIGURE_LOGGING evaluated to false, but "
"VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
@@ -152,16 +165,22 @@ def _configure_vllm_root_logger() -> None:
"VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH."
)
- if VLLM_CONFIGURE_LOGGING:
+ if envs.VLLM_CONFIGURE_LOGGING:
logging_config = DEFAULT_LOGGING_CONFIG
- if VLLM_LOGGING_CONFIG_PATH:
- if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+ vllm_handler = logging_config["handlers"]["vllm"]
+ # Refresh these values in case env vars have changed.
+ vllm_handler["level"] = envs.VLLM_LOGGING_LEVEL
+ vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM
+ vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm"
+
+ if envs.VLLM_LOGGING_CONFIG_PATH:
+ if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH):
raise RuntimeError(
"Could not load logging config. File does not exist: %s",
- VLLM_LOGGING_CONFIG_PATH,
+ envs.VLLM_LOGGING_CONFIG_PATH,
)
- with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+ with open(envs.VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
custom_config = json.loads(file.read())
if not isinstance(custom_config, dict):
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
index 44b40ead973ba..8d3354df215b1 100644
--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
@@ -1,12 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
from vllm.logging_utils.lazy import lazy
from vllm.logging_utils.log_time import logtime
__all__ = [
"NewLineFormatter",
+ "ColoredFormatter",
"lazy",
"logtime",
]
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index 02ba308e18796..3ad4ef8d119ad 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -75,3 +75,53 @@ class NewLineFormatter(logging.Formatter):
parts = msg.split(record.message)
msg = msg.replace("\n", "\r\n" + parts[0])
return msg
+
+
+class ColoredFormatter(NewLineFormatter):
+ """Adds ANSI color codes to log levels for terminal output.
+
+ This formatter adds colors by injecting them into the format string for
+ static elements (timestamp, filename, line number) and modifying the
+ levelname attribute for dynamic color selection.
+ """
+
+ # ANSI color codes
+ COLORS = {
+ "DEBUG": "\033[37m", # White
+ "INFO": "\033[32m", # Green
+ "WARNING": "\033[33m", # Yellow
+ "ERROR": "\033[31m", # Red
+ "CRITICAL": "\033[35m", # Magenta
+ }
+ GREY = "\033[90m" # Grey for timestamp and file info
+ RESET = "\033[0m"
+
+ def __init__(self, fmt, datefmt=None, style="%"):
+ # Inject grey color codes into format string for timestamp and file info
+ if fmt:
+ # Wrap %(asctime)s with grey
+ fmt = fmt.replace("%(asctime)s", f"{self.GREY}%(asctime)s{self.RESET}")
+ # Wrap [%(fileinfo)s:%(lineno)d] with grey
+ fmt = fmt.replace(
+ "[%(fileinfo)s:%(lineno)d]",
+ f"{self.GREY}[%(fileinfo)s:%(lineno)d]{self.RESET}",
+ )
+
+ # Call parent __init__ with potentially modified format string
+ super().__init__(fmt, datefmt, style)
+
+ def format(self, record):
+ # Store original levelname to restore later (in case record is reused)
+ orig_levelname = record.levelname
+
+ # Only modify levelname - it needs dynamic color based on severity
+ if (color_code := self.COLORS.get(record.levelname)) is not None:
+ record.levelname = f"{color_code}{record.levelname}{self.RESET}"
+
+ # Call parent format which will handle everything else
+ msg = super().format(record)
+
+ # Restore original levelname
+ record.levelname = orig_levelname
+
+ return msg
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
index 5968884e232a4..cc872040b6c5f 100644
--- a/vllm/utils/system_utils.py
+++ b/vllm/utils/system_utils.py
@@ -22,7 +22,7 @@ from .platform_utils import cuda_is_initialized, xpu_is_initialized
logger = init_logger(__name__)
-CYAN = "\033[1;36m"
+CYAN = "\033[0;36m"
RESET = "\033[0;0m"
@@ -142,7 +142,10 @@ def set_process_title(
def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
"""Add colored prefix to file output for log decoration."""
- prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+ if envs.NO_COLOR:
+ prefix = f"({worker_name} pid={pid}) "
+ else:
+ prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
file_write = file.write
def write_with_prefix(s: str):
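
Condensed sketch of the color decision introduced in _use_color (stream names match the patch; the function below is illustrative, not the vLLM code):

import sys


def use_color(no_color: bool, logging_color: str, logging_stream: str) -> bool:
    if no_color or logging_color == "0":
        return False
    if logging_color == "1":
        return True
    # "auto": colorize only when the configured stream is a real terminal.
    if logging_stream == "ext://sys.stdout":
        return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    if logging_stream == "ext://sys.stderr":
        return hasattr(sys.stderr, "isatty") and sys.stderr.isatty()
    return False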
From 1d642872a27f1c6bedf28669642928cc7eec6532 Mon Sep 17 00:00:00 2001
From: liangel-02
Date: Wed, 19 Nov 2025 19:39:45 -0500
Subject: [PATCH 046/249] [torchao] fix safetensors for sharding (#28169)
Signed-off-by: Angel Li
---
tests/quantization/test_torchao.py | 9 ++++----
.../model_loader/default_loader.py | 2 +-
.../model_loader/weight_utils.py | 23 +++++++++++++++----
3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index fb8d6130c3779..f35c3973ab6e6 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -225,13 +225,12 @@ def test_reload_weights():
@pytest.mark.skip(
reason="since torchao nightly is only compatible with torch nightly"
"currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
- "torchao tests that requires newer versions (0.14.0.dev+) for now"
+    "torchao tests that require newer versions (0.15.0.dev+) for now"
)
-def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner):
+def test_safetensors_model_loading_with_params(vllm_runner):
torch._dynamo.reset()
- model_name = (
- "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
- )
+ # using this model to test safetensors loading with file sharding
+ model_name = "torchao-testing/Qwen3-8B-INT4-0.15.0dev-safetensors"
with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index b80026741781f..67aa584c6bda2 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -279,7 +279,7 @@ class DefaultModelLoader(BaseModelLoader):
if (
hasattr(quant_config, "is_checkpoint_torchao_serialized")
and quant_config.is_checkpoint_torchao_serialized
- and torchao_version_at_least("0.14.0")
+ and torchao_version_at_least("0.15.0")
):
self.load_config.safetensors_load_strategy = "torchao"
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 89634cbf41241..4572ebe2ea11b 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -595,6 +595,9 @@ def safetensors_weights_iterator(
if safetensors_load_strategy == "eager":
loading_desc += " (eager)"
+ state_dict = {}
+ leftover_state_dict: dict[str, torch.Tensor] = {}
+
for st_file in tqdm(
hf_weights_files,
desc=loading_desc,
@@ -606,9 +609,11 @@ def safetensors_weights_iterator(
state_dict = load(f.read())
yield from state_dict.items()
elif safetensors_load_strategy == "torchao":
- if not torchao_version_at_least("0.14.0"):
+ # we can't load flattened torchao tensor subclasses directly into the model
+ # instead we reconstruct the subclasses here before returning
+ if not torchao_version_at_least("0.15.0"):
raise ValueError(
- "Please use torchao version >= 0.14.0 \
+ "Please use torchao version >= 0.15.0 \
to load torchao safetensors checkpoint"
)
from torchao.prototype.safetensors.safetensors_support import (
@@ -616,12 +621,20 @@ def safetensors_weights_iterator(
)
with safe_open(st_file, framework="pt") as f:
- state_dict = {}
for name in f.keys(): # noqa: SIM118
state_dict[name] = f.get_tensor(name)
+
+ # update with leftover tensor data from previous iteration, if any
+ state_dict.update(leftover_state_dict)
metadata = f.metadata()
- updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata)
- yield from updated_state_dict.items()
+                # Due to sharded checkpoints, we are not guaranteed that all
+                # tensor subclass data lives in one file.
+                # leftover_state_dict carries the data from this step that is
+                # still waiting for missing pieces from a future iteration.
+ unflattened_state_dict, leftover_state_dict = (
+ unflatten_tensor_state_dict(state_dict, metadata)
+ )
+ yield from unflattened_state_dict.items()
else:
with safe_open(st_file, framework="pt") as f:
for name in f.keys(): # noqa: SIM118
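
Schematic of the cross-shard carry-over added above; `shards` and `unflatten` stand in for the safetensors files and torchao's unflatten_tensor_state_dict, so treat this purely as a sketch:

def iterate_sharded_weights(shards, unflatten):
    leftover: dict = {}
    for shard in shards:
        state_dict = dict(shard)     # tensors read from this file
        state_dict.update(leftover)  # merge pieces still waiting from earlier files
        complete, leftover = unflatten(state_dict)
        yield from complete.items()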
From 05c2dee7e9f485f1e76eee084849e07c1c12a68b Mon Sep 17 00:00:00 2001
From: Kuntai Du
Date: Thu, 20 Nov 2025 09:40:49 +0800
Subject: [PATCH 047/249] [DeepSeek + LMCache Multiprocess] handle MLA for
deepseek model + LMCache Multiprocess connector (#29039)
Signed-off-by: KuntaiDu
---
.../kv_connector/v1/lmcache_mp_connector.py | 47 +++++++++++++++----
1 file changed, 39 insertions(+), 8 deletions(-)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
index 55831dc56c803..22ddabbf1e352 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, cast
import torch
import zmq
+from lmcache.integration.vllm.utils import mla_enabled
from lmcache.utils import init_logger as lmcache_init_logger
from vllm.config import VllmConfig
@@ -60,17 +61,44 @@ def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]:
return block_ids[0]
+def extract_world_size_and_kv_rank(
+ world_size: int,
+ rank: int,
+ vllm_config: VllmConfig,
+) -> tuple[int, int]:
+ """
+    Convert the world size and rank into KV-cache terms for MLA models.
+ """
+ use_mla = mla_enabled(vllm_config.model_config)
+ if not use_mla:
+ return world_size, rank
+ else:
+ # Tensor parallel does not change the KV caches for MLA models.
+ # So we need to "exclude" the effect of TP on rank and world size
+ tp_size = vllm_config.parallel_config.tensor_parallel_size
+        # vLLM constructs TP groups first, and then constructs other
+ # parallel groups on top of TP groups.
+ # for example, TP=4, PP=2,
+ # TP group: [0, 1, 2, 3], [4, 5, 6, 7]
+ # PP group: [0, 4], [1, 5], [2, 6], [3, 7]
+ # So we can "exclude" the effect of TP by rank // tp_size.
+ return world_size // tp_size, rank // tp_size
+
+
def create_scheduler_adapter(
server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig
) -> LMCacheMPSchedulerAdapter:
- # TODO: have a helper function to calculate the correct rank and
- # world size for the MLA and other models
+ world_size, kv_rank = extract_world_size_and_kv_rank(
+ vllm_config.parallel_config.world_size,
+ vllm_config.parallel_config.rank,
+ vllm_config,
+ )
return LMCacheMPSchedulerAdapter(
server_url,
zmq_context,
vllm_config.model_config.model,
- vllm_config.parallel_config.world_size,
- vllm_config.parallel_config.rank,
+ world_size,
+ kv_rank,
vllm_config.cache_config.block_size,
)
@@ -78,14 +106,17 @@ def create_scheduler_adapter(
def create_worker_adapter(
server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig
) -> LMCacheMPWorkerAdapter:
- # TODO: have a helper function to calculate the correct rank and
- # world size for the MLA and other models
+ world_size, kv_rank = extract_world_size_and_kv_rank(
+ vllm_config.parallel_config.world_size,
+ vllm_config.parallel_config.rank,
+ vllm_config,
+ )
return LMCacheMPWorkerAdapter(
server_url,
zmq_context,
vllm_config.model_config.model,
- vllm_config.parallel_config.world_size,
- vllm_config.parallel_config.rank,
+ world_size,
+ kv_rank,
vllm_config.cache_config.block_size,
)
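For illustration, a minimal standalone sketch (a hypothetical helper, not the connector code itself) of the rank remapping described in the comments above: because all tensor-parallel ranks share the same MLA KV cache, TP is divided out of both the world size and the rank.

def kv_world_size_and_rank(world_size: int, rank: int, tp_size: int, use_mla: bool) -> tuple[int, int]:
    # For non-MLA models every rank holds a distinct KV shard, so nothing changes.
    if not use_mla:
        return world_size, rank
    # For MLA models, all ranks in a TP group hold the same KV cache,
    # so divide TP out of both quantities.
    return world_size // tp_size, rank // tp_size

# With TP=4 and PP=2 (world_size=8): ranks 0-3 map to KV rank 0,
# ranks 4-7 map to KV rank 1, and the KV world size collapses to 2.
assert kv_world_size_and_rank(8, 5, tp_size=4, use_mla=True) == (2, 1)
assert kv_world_size_and_rank(8, 5, tp_size=4, use_mla=False) == (8, 5)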
From 3fb0d90999887949629d1e9bac4d98336a35c475 Mon Sep 17 00:00:00 2001
From: Qiang Zhang
Date: Thu, 20 Nov 2025 10:11:52 +0800
Subject: [PATCH 048/249] [AMD] Use Decoupled Kernel Block Size to Support
AITER MLA block_size=1 (#27715)
Signed-off-by: chiangzhang
---
vllm/attention/backends/abstract.py | 14 +++---
.../attention/backends/mla/rocm_aiter_mla.py | 45 +++----------------
2 files changed, 13 insertions(+), 46 deletions(-)
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index d28bc065852db..188becb6ad6f0 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -119,14 +119,12 @@ class AttentionBackend(ABC):
return True
for supported_size in cls.supported_kernel_block_sizes:
- is_multiple_of = (
- isinstance(supported_size, MultipleOf)
- and block_size % supported_size.base == 0
- )
- is_int_equal = (
- isinstance(supported_size, int) and block_size == supported_size
- )
- if is_multiple_of or is_int_equal:
+ if isinstance(supported_size, MultipleOf):
+ supported_size = supported_size.base
+            # With the hybrid_blocks feature, the framework-level block size
+ # only needs to be a multiple of the kernel's requirement,
+ # even if the kernel requires a fixed block_size.
+ if block_size % supported_size == 0:
return True
return False
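As a rough standalone sketch (assumed code, not the vLLM class itself), the relaxed check reduces every supported entry to a base value and accepts any framework block size that is a multiple of it; this is how a kernel block size of 1, as declared by AiterMLABackend below, accepts every framework block size.

from dataclasses import dataclass

@dataclass
class MultipleOf:
    base: int

def supports_block_size(supported: list[int | MultipleOf], block_size: int) -> bool:
    for entry in supported:
        base = entry.base if isinstance(entry, MultipleOf) else entry
        # The framework block size only has to be a multiple of the kernel's base.
        if block_size % base == 0:
            return True
    return False

assert supports_block_size([1], 16)              # kernel block size 1 accepts anything
assert supports_block_size([MultipleOf(16)], 64)
assert not supports_block_size([MultipleOf(16)], 24)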
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index e1864526f02cc..6ccc1a341d56c 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -7,9 +7,8 @@ from typing import ClassVar
import torch
from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionLayer
+from vllm.attention.backends.abstract import AttentionLayer, MultipleOf
from vllm.config import VllmConfig
-from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.mla.common import (
MLACommonBackend,
MLACommonDecodeMetadata,
@@ -22,6 +21,8 @@ from vllm.v1.kv_cache_interface import AttentionSpec
class AiterMLABackend(MLACommonBackend):
+ supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1]
+
@staticmethod
def get_name() -> str:
return "ROCM_AITER_MLA"
@@ -71,9 +72,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
)
self.compilation_config = vllm_config.compilation_config
- max_num_pages_per_req = cdiv(
- vllm_config.model_config.max_model_len, self.kv_cache_spec.block_size
- )
+ # kernel block size is always 1.
+ max_num_pages_per_req = vllm_config.model_config.max_model_len
max_num_reqs = vllm_config.scheduler_config.max_num_seqs
max_num_pages = max_num_reqs * max_num_pages_per_req
@@ -82,11 +82,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
# so we can only use the persistent buffer if a cudagraph is actually
# being used.
if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
- self.block_table_remapping = torch.zeros(
- [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size],
- dtype=torch.int32,
- device=device,
- )
self.paged_kv_indptr = torch.zeros(
max_num_reqs + 1, dtype=torch.int32, device=device
)
@@ -111,36 +106,16 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
num_decode_tokens: int,
dcp_tot_seq_lens_device: torch.Tensor | None,
) -> AiterMLADecodeMetadata:
- page_size = self.kv_cache_spec.block_size
+        # The kernel block size is always 1, though the KV block size is not.
device = self.device
num_reqs = seq_lens_device.size(0)
- bs, _ = block_table_tensor.shape
- block_table_tensor = (
- block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size
- )
- block_table_tensor = (
- block_table_tensor
- + torch.arange(
- 0,
- page_size,
- device=block_table_tensor.device,
- dtype=block_table_tensor.dtype,
- )[None, None, :]
- )
- block_table_tensor = block_table_tensor.view(bs, -1)
- # after remapping, we assume the block size already equals to 1
-
- max_blk_size_per_req = block_table_tensor.shape[-1]
mask = torch.arange(
block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device
).unsqueeze(0) < seq_lens_device.unsqueeze(1)
paged_kv_indices = block_table_tensor[mask]
- paged_kv_last_page_len = seq_lens_device % page_size
- paged_kv_last_page_len = torch.where(
- paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len
- )
+ paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device)
paged_kv_indptr = torch.cat(
[
@@ -151,12 +126,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
num_actual_pages = paged_kv_indices.size(0)
- self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_(
- block_table_tensor, non_blocking=True
- )
- block_table_tensor = self.block_table_remapping[
- :num_reqs, :max_blk_size_per_req
- ]
self.paged_kv_indices[:num_actual_pages].copy_(
paged_kv_indices, non_blocking=True
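A toy illustration (not the vLLM code path, with made-up values) of how a per-request block table is flattened into paged_kv_indices and paged_kv_indptr using a length mask when the kernel page size is 1:

import torch

block_table = torch.tensor([[10, 11, 12, 0],
                            [20, 21,  0, 0]])   # [num_reqs, max_pages]
seq_lens = torch.tensor([3, 2])                 # valid pages per request

mask = torch.arange(block_table.size(1)).unsqueeze(0) < seq_lens.unsqueeze(1)
paged_kv_indices = block_table[mask]            # tensor([10, 11, 12, 20, 21])
paged_kv_indptr = torch.cat(
    [torch.zeros(1, dtype=torch.long), seq_lens.cumsum(0)]
)                                               # tensor([0, 3, 5])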
From 3168285fcaaee09bc93dce7bc9ae6ee823c71652 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Date: Thu, 20 Nov 2025 02:37:09 +0000
Subject: [PATCH 049/249] [cpu][ci] Add initial set of tests for Arm CPUs
(#28657)
Signed-off-by: Fadi Arafeh
---
.../scripts/hardware_ci/run-cpu-test-arm.sh | 64 +++++++++++++++++++
docker/Dockerfile.cpu | 10 +++
2 files changed, 74 insertions(+)
create mode 100755 .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
new file mode 100755
index 0000000000000..d0036f24c8d04
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Allow binding to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+NUMA_NODE=${NUMA_NODE:-0}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+ set -e;
+ docker rm -f cpu-test-"$NUMA_NODE" || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+
+function cpu_tests() {
+ set -e
+ export NUMA_NODE=$2
+
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pip list"
+
+ # offline inference
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+ # Run kernel tests
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -x -v -s tests/kernels/test_onednn.py
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+ # basic online serving
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
+ set -e
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model meta-llama/Llama-3.2-3B-Instruct \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
+}
+
+# All CPU tests are expected to finish in under 40 minutes.
+export -f cpu_tests
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 4c961defaeda2..eb3807ef0ca4e 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
@@ -122,6 +123,15 @@ WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+ remove_packages_not_supported_on_aarch64() { \
+ case "$(uname -m)" in \
+ aarch64|arm64) \
+ sed -i '/decord/d' requirements/cpu-test.in; \
+ sed -i '/terratorch/d' requirements/cpu-test.in; \
+ ;; \
+ esac; \
+ }; \
+ remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
From fcbcba6c70a3308705aa21adebb443bf9015b486 Mon Sep 17 00:00:00 2001
From: Benjamin Chislett
Date: Wed, 19 Nov 2025 22:17:48 -0500
Subject: [PATCH 050/249] [Feat] Iteration-level profiling for Torch and CUDA
profiler (#28987)
Signed-off-by: Benjamin Chislett
Signed-off-by: Benjamin Chislett
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
tests/v1/worker/test_gpu_profiler.py | 203 +++++++++++++++++++++++++
vllm/envs.py | 16 ++
vllm/profiler/gpu_profiler.py | 217 ++++++++++++++++++++++++---
vllm/v1/engine/async_llm.py | 14 +-
vllm/v1/worker/gpu_worker.py | 50 ++----
5 files changed, 437 insertions(+), 63 deletions(-)
create mode 100644 tests/v1/worker/test_gpu_profiler.py
diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py
new file mode 100644
index 0000000000000..f7255fae05a4e
--- /dev/null
+++ b/tests/v1/worker/test_gpu_profiler.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+import vllm.envs as envs
+from vllm.profiler.gpu_profiler import WorkerProfiler
+
+
+class ConcreteWorkerProfiler(WorkerProfiler):
+ """
+ A basic implementation of a worker profiler for testing purposes.
+ """
+
+ def __init__(self):
+ self.start_call_count = 0
+ self.stop_call_count = 0
+ self.should_fail_start = False
+ super().__init__()
+
+ def _start(self) -> None:
+ if self.should_fail_start:
+ raise RuntimeError("Simulated start failure")
+ self.start_call_count += 1
+
+ def _stop(self) -> None:
+ self.stop_call_count += 1
+
+
+@pytest.fixture(autouse=True)
+def reset_mocks():
+ """Fixture to reset mocks and env variables before each test."""
+ envs.VLLM_PROFILER_DELAY_ITERS = 0
+ envs.VLLM_PROFILER_MAX_ITERS = 0
+
+
+def test_immediate_start_stop():
+ """Test standard start without delay."""
+ profiler = ConcreteWorkerProfiler()
+
+ profiler.start()
+ assert profiler._running is True
+ assert profiler._active is True
+ assert profiler.start_call_count == 1
+
+ profiler.stop()
+ assert profiler._running is False
+ assert profiler._active is False
+ assert profiler.stop_call_count == 1
+
+
+def test_delayed_start():
+ """Test that profiler waits for N steps before actually starting."""
+ envs.VLLM_PROFILER_DELAY_ITERS = 2
+ profiler = ConcreteWorkerProfiler()
+
+ # User requests start
+ profiler.start()
+
+ # Should be active (request accepted) but not running (waiting for delay)
+ assert profiler._active is True
+ assert profiler._running is False
+ assert profiler.start_call_count == 0
+
+ # Step 1
+ profiler.step()
+ assert profiler._running is False
+
+ # Step 2 (Threshold reached)
+ profiler.step()
+ assert profiler._running is True
+ assert profiler.start_call_count == 1
+
+
+def test_max_iterations():
+ """Test that profiler stops automatically after max iterations."""
+ envs.VLLM_PROFILER_MAX_ITERS = 2
+ profiler = ConcreteWorkerProfiler()
+
+ profiler.start()
+ assert profiler._running is True
+
+ # Iteration 1
+ profiler.step() # profiling_count becomes 1
+ assert profiler._running is True
+
+ # Iteration 2
+ profiler.step() # profiling_count becomes 2
+ assert profiler._running is True
+
+ # Iteration 3 (Exceeds max)
+ profiler.step() # profiling_count becomes 3
+
+ # Should have stopped now
+ assert profiler._running is False
+ assert profiler.stop_call_count == 1
+
+
+def test_delayed_start_and_max_iters():
+ """Test combined delayed start and max iterations."""
+ envs.VLLM_PROFILER_DELAY_ITERS = 2
+ envs.VLLM_PROFILER_MAX_ITERS = 2
+ profiler = ConcreteWorkerProfiler()
+
+ profiler.start()
+
+ # Step 1
+ profiler.step()
+ assert profiler._running is False
+ assert profiler._active is True
+
+ # Step 2 (Starts now)
+ profiler.step()
+ assert profiler._profiling_for_iters == 1
+ assert profiler._running is True
+ assert profiler._active is True
+
+ # Next iteration
+ profiler.step()
+ assert profiler._profiling_for_iters == 2
+ assert profiler._running is True
+
+ # Iteration 2 (exceeds max)
+ profiler.step()
+
+ # Should have stopped now
+ assert profiler._running is False
+ assert profiler.stop_call_count == 1
+
+
+def test_idempotency():
+ """Test that calling start/stop multiple times doesn't break logic."""
+ profiler = ConcreteWorkerProfiler()
+
+ # Double Start
+ profiler.start()
+ profiler.start()
+ assert profiler.start_call_count == 1 # Should only start once
+
+ # Double Stop
+ profiler.stop()
+ profiler.stop()
+ assert profiler.stop_call_count == 1 # Should only stop once
+
+
+def test_step_inactive():
+ """Test that stepping while inactive does nothing."""
+ envs.VLLM_PROFILER_DELAY_ITERS = 2
+ profiler = ConcreteWorkerProfiler()
+
+ # Not started yet
+ profiler.step()
+ profiler.step()
+
+ # Even though we stepped 2 times, start shouldn't happen because active=False
+ assert profiler.start_call_count == 0
+
+
+def test_start_failure():
+ """Test behavior when the underlying _start method raises exception."""
+ profiler = ConcreteWorkerProfiler()
+ profiler.should_fail_start = True
+
+ profiler.start()
+
+ # Exception caught in _call_start
+ assert profiler._running is False # Should not mark as running
+ assert profiler._active is True # Request is still considered active
+ assert profiler.start_call_count == 0 # Logic failed inside start
+
+
+def test_shutdown():
+ """Test that shutdown calls stop only if running."""
+ profiler = ConcreteWorkerProfiler()
+
+ # Case 1: Not running
+ profiler.shutdown()
+ assert profiler.stop_call_count == 0
+
+ # Case 2: Running
+ profiler.start()
+ profiler.shutdown()
+ assert profiler.stop_call_count == 1
+
+
+def test_mixed_delay_and_stop():
+ """Test manual stop during the delay period."""
+ envs.VLLM_PROFILER_DELAY_ITERS = 5
+ profiler = ConcreteWorkerProfiler()
+
+ profiler.start()
+ profiler.step()
+ profiler.step()
+
+ # User cancels before delay finishes
+ profiler.stop()
+ assert profiler._active is False
+
+ # Further steps should not trigger start
+ profiler.step()
+ profiler.step()
+ profiler.step()
+
+ assert profiler.start_call_count == 0
diff --git a/vllm/envs.py b/vllm/envs.py
index 614bc94b978bd..888a09cf6d3ec 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -92,11 +92,14 @@ if TYPE_CHECKING:
VLLM_TORCH_PROFILER_DIR: str | None = None
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
+ VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False
VLLM_USE_AOT_COMPILE: bool = False
VLLM_USE_BYTECODE_HOOK: bool = False
VLLM_FORCE_AOT_LOAD: bool = False
VLLM_TORCH_PROFILER_WITH_STACK: bool = True
VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
+ VLLM_PROFILER_DELAY_ITERS: int = 0
+ VLLM_PROFILER_MAX_ITERS: int = 0
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
@@ -872,6 +875,19 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
+    # Disable torch profiling of the AsyncLLM process.
+    # If set to 1, the AsyncLLM process will not be profiled.
+ "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool(
+ os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0"
+ ),
+    # Number of iterations to wait before starting profiling when using
+    # the torch or CUDA profiler. If set to 0, profiling starts immediately.
+ "VLLM_PROFILER_DELAY_ITERS": lambda: int(
+ os.getenv("VLLM_PROFILER_DELAY_ITERS", "0")
+ ),
+    # Maximum number of iterations to profile when using the torch or CUDA profiler.
+    # If set to 0, the number of profiled iterations is not limited.
+ "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
# If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
# If set, allow loading or unloading lora adapters in runtime,
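A hedged usage example of the new knobs (paths and values are illustrative): with VLLM_TORCH_PROFILER_DIR set, profiling is driven by start_profile/stop_profile requests as referenced in the wrapper's log messages, VLLM_PROFILER_DELAY_ITERS skips the first N worker steps after the start request, and VLLM_PROFILER_MAX_ITERS bounds how many steps are captured.

import os
import subprocess

env = dict(
    os.environ,
    VLLM_TORCH_PROFILER_DIR="/tmp/vllm_traces",  # traces are written here
    VLLM_PROFILER_DELAY_ITERS="5",               # skip the first 5 worker steps
    VLLM_PROFILER_MAX_ITERS="20",                # then capture at most 20 steps
)
subprocess.run(["vllm", "serve", "facebook/opt-125m"], env=env)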
diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py
index 58c6689531615..2155b67a3db4b 100644
--- a/vllm/profiler/gpu_profiler.py
+++ b/vllm/profiler/gpu_profiler.py
@@ -1,37 +1,212 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from contextlib import nullcontext
+
+import torch
+from typing_extensions import override
+
+import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
-class CudaProfilerWrapper:
+class WorkerProfiler(ABC):
def __init__(self) -> None:
- self._profiler_running = False
+ self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS
+ if self._delay_iters > 0:
+ logger.info_once(
+ "GPU profiling will start "
+ f"{self._delay_iters} steps after start_profile."
+ )
+
+ self._max_iters = envs.VLLM_PROFILER_MAX_ITERS
+ if self._max_iters > 0:
+ logger.info_once(
+ "GPU profiling will stop "
+ f"after {self._max_iters} worker steps, "
+ "or when stop_profile is received."
+ )
+
+ # Track when the profiler gets triggered by start_profile
+ self._active_iteration_count = 0
+ self._active = False
+
+ # Track when the profiler is actually running
+ self._profiling_for_iters = 0
+ self._running = False
+
+ @abstractmethod
+ def _start(self) -> None:
+ """Start the profiler."""
+ pass
+
+ @abstractmethod
+ def _stop(self) -> None:
+ """Stop the profiler."""
+ pass
+
+ def _call_start(self) -> None:
+ """Call _start with error handling but no safeguards."""
+ try:
+ self._start()
+ self._running = True # Only mark as running if start succeeds
+ except Exception as e:
+ logger.warning("Failed to start profiler: %s", e)
+
+ def _call_stop(self) -> None:
+ """Call _stop with error handling but no safeguards."""
+ try:
+ self._stop()
+ logger.info("Profiler stopped successfully.")
+ except Exception as e:
+ logger.warning("Failed to stop profiler: %s", e)
+ self._running = False # Always mark as not running, assume stop worked
+
+ def start(self) -> None:
+ """Attempt to start the profiler, accounting for delayed starts."""
+ if self._active:
+ logger.debug(
+ "start_profile received when profiler is already active. "
+ "Ignoring request."
+ )
+ return
+ self._active = True
+ if self._delay_iters == 0:
+ self._call_start()
+
+ def step(self) -> None:
+ """Update the profiler state at each worker step,
+ to handle delayed starts and max iteration limits."""
+ if not self._active:
+ return
+
+ self._active_iteration_count += 1
+
+ if (
+ not self._running
+ and self._delay_iters > 0
+ and self._active_iteration_count == self._delay_iters
+ ):
+ logger.info("Starting profiler after delay...")
+ self._call_start()
+
+ if self._running:
+ self._profiling_for_iters += 1
+
+ if (
+ self._max_iters > 0
+ and self._running
+ and self._profiling_for_iters > self._max_iters
+ ):
+            # Automatically stop the profiler after max iters. It will be
+            # marked as not running, but left active so that stop() can
+            # clean up properly.
+ logger.info("Max profiling iterations reached. Stopping profiler...")
+ self._call_stop()
+ return
+
+ def stop(self) -> None:
+ """Attempt to stop the profiler, accounting for overlapped calls."""
+ if not self._active:
+ logger.debug(
+ "stop_profile received when profiler is not active. Ignoring request."
+ )
+ return
+ self._active = False
+ self._active_iteration_count = 0
+ self._profiling_for_iters = 0
+
+ if self._running:
+ self._call_stop()
+
+ def shutdown(self) -> None:
+ """Ensure profiler is stopped when shutting down."""
+ logger.info_once("Shutting down profiler")
+ if self._running:
+ self.stop()
+
+ def annotate_context_manager(self, name: str):
+ """Return a context manager to annotate profiler traces."""
+ return nullcontext()
+
+
+class TorchProfilerWrapper(WorkerProfiler):
+ def __init__(self, worker_name: str, local_rank: int) -> None:
+ super().__init__()
+
+ self.local_rank = local_rank
+ torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+ logger.info(
+ "Torch profiling enabled. Traces will be saved to: %s",
+ torch_profiler_trace_dir,
+ )
+ logger.debug(
+ "Profiler config: record_shapes=%s,"
+ "profile_memory=%s,with_stack=%s,with_flops=%s",
+ envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
+ envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
+ envs.VLLM_TORCH_PROFILER_WITH_STACK,
+ envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+ )
+ self.profiler = torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
+ profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
+ with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+ with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+ on_trace_ready=torch.profiler.tensorboard_trace_handler(
+ torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+ ),
+ )
+
+ @override
+ def _start(self) -> None:
+ self.profiler.start()
+
+ @override
+ def _stop(self) -> None:
+ self.profiler.stop()
+
+ rank = self.local_rank
+ profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
+ profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
+ sort_key = "self_cuda_time_total"
+ table = self.profiler.key_averages().table(sort_by=sort_key)
+
+ with open(profiler_out_file, "w") as f:
+ print(table, file=f)
+
+ # only print profiler results on rank 0
+ if rank == 0:
+ print(table)
+
+ @override
+ def annotate_context_manager(self, name: str):
+ return torch.profiler.record_function(name)
+
+
+class CudaProfilerWrapper(WorkerProfiler):
+ def __init__(self) -> None:
+ super().__init__()
# Note: lazy import to avoid dependency issues if CUDA is not available.
import torch.cuda.profiler as cuda_profiler
self._cuda_profiler = cuda_profiler
- def start(self) -> None:
- try:
- self._cuda_profiler.start()
- self._profiler_running = True
- logger.info_once("Started CUDA profiler")
- except Exception as e:
- logger.warning_once("Failed to start CUDA profiler: %s", e)
+ @override
+ def _start(self) -> None:
+ self._cuda_profiler.start()
- def stop(self) -> None:
- if self._profiler_running:
- try:
- self._cuda_profiler.stop()
- logger.info_once("Stopped CUDA profiler")
- except Exception as e:
- logger.warning_once("Failed to stop CUDA profiler: %s", e)
- finally:
- self._profiler_running = False
+ @override
+ def _stop(self) -> None:
+ self._cuda_profiler.stop()
- def shutdown(self) -> None:
- """Ensure profiler is stopped when shutting down."""
- self.stop()
+ @override
+ def annotate_context_manager(self, name: str):
+ return torch.cuda.nvtx.range(name)
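A small lifecycle sketch (run_model_step is a hypothetical stand-in for the worker's execute path): start() honours the configured delay, step() drives the delayed start and the automatic stop, and annotate_context_manager() yields an NVTX range, a torch record_function, or a nullcontext depending on the wrapper.

from vllm.profiler.gpu_profiler import CudaProfilerWrapper

def run_model_step():
    ...  # hypothetical stand-in for the worker's model execution

profiler = CudaProfilerWrapper()        # or TorchProfilerWrapper(worker_name, local_rank)
profiler.start()                        # may defer via VLLM_PROFILER_DELAY_ITERS
for step_idx in range(100):
    profiler.step()                     # starts after the delay, stops after max iters
    with profiler.annotate_context_manager(f"execute_step_{step_idx}"):
        run_model_step()
profiler.shutdown()                     # ensures the profiler is stopped on teardown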
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index c160c7cbcab4a..abf2c8cfa4539 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -160,11 +160,23 @@ class AsyncLLM(EngineClient):
except RuntimeError:
pass
- if envs.VLLM_TORCH_PROFILER_DIR:
+ if (
+ envs.VLLM_TORCH_PROFILER_DIR
+ and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM
+ ):
logger.info(
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
envs.VLLM_TORCH_PROFILER_DIR,
)
+ if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0:
+ logger.warning_once(
+ "Torch profiler received max_iters or delay_iters setting. These "
+ "are not compatible with the AsyncLLM profiler and will be ignored "
+ "for the AsyncLLM process. Engine process profiling will still "
+ "respect these settings. Consider setting "
+ "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable "
+ "AsyncLLM profiling."
+ )
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
self.profiler = torch.profiler.profile(
activities=[
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 7f9cdd221224b..18cbc38262793 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -36,7 +36,7 @@ from vllm.model_executor import set_random_seed
from vllm.model_executor.models.interfaces import is_mixture_of_experts
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
from vllm.platforms import current_platform
-from vllm.profiler.gpu_profiler import CudaProfilerWrapper
+from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
@@ -90,32 +90,9 @@ class Worker(WorkerBase):
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
- torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
- logger.info(
- "Profiling enabled. Traces will be saved to: %s",
- torch_profiler_trace_dir,
- )
- logger.debug(
- "Profiler config: record_shapes=%s,"
- "profile_memory=%s,with_stack=%s,with_flops=%s",
- envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
- envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
- envs.VLLM_TORCH_PROFILER_WITH_STACK,
- envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
- )
- self.profiler = torch.profiler.profile(
- activities=[
- torch.profiler.ProfilerActivity.CPU,
- torch.profiler.ProfilerActivity.CUDA,
- ],
- record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
- profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
- with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
- with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
- on_trace_ready=torch.profiler.tensorboard_trace_handler(
- torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
- ),
+ self.profiler = TorchProfilerWrapper(
+ worker_name=worker_name, local_rank=self.local_rank
)
elif envs.VLLM_TORCH_CUDA_PROFILE:
self.profiler = CudaProfilerWrapper()
@@ -526,10 +503,12 @@ class Worker(WorkerBase):
if not self.profiler:
return nullcontext()
+ self.profiler.step()
+
num_new = len(scheduler_output.scheduled_new_reqs)
num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
- return torch.profiler.record_function(
+ return self.profiler.annotate_context_manager(
f"execute_new_{num_new}_cached_{num_cached}"
)
@@ -587,24 +566,11 @@ class Worker(WorkerBase):
def profile(self, is_start: bool = True):
if self.profiler is None:
- raise RuntimeError("Profiler is not enabled.")
+ raise RuntimeError("Profiling is not enabled.")
if is_start:
self.profiler.start()
else:
self.profiler.stop()
- if isinstance(self.profiler, torch.profiler.profile):
- rank = self.local_rank
- profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
- profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
- sort_key = "self_cuda_time_total"
- table = self.profiler.key_averages().table(sort_by=sort_key)
-
- with open(profiler_out_file, "w") as f:
- print(table, file=f)
-
- # only print profiler results on rank 0
- if rank == 0:
- print(table)
def execute_dummy_batch(self) -> None:
self.model_runner._dummy_run(1, uniform_decode=True)
@@ -865,6 +831,8 @@ class Worker(WorkerBase):
def shutdown(self) -> None:
if runner := getattr(self, "model_runner", None):
runner.ensure_kv_transfer_shutdown()
+ if self.profiler is not None:
+ self.profiler.shutdown()
def init_worker_distributed_environment(
From a8c536829cb7b5564f54beff97e938666f286dd6 Mon Sep 17 00:00:00 2001
From: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com>
Date: Wed, 19 Nov 2025 19:39:36 -0800
Subject: [PATCH 051/249] Consolidate Nvidia ModelOpt quant config handling for
all quantization methods (#28076)
Signed-off-by: Shengliang Xu
---
.../layers/quantization/modelopt.py | 499 ++++++++----------
1 file changed, 234 insertions(+), 265 deletions(-)
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index dedab33c1bdb7..6b5ed7762eb31 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
+from fnmatch import fnmatch
from typing import TYPE_CHECKING, Any, Optional
import torch
@@ -13,7 +14,6 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import (
- FusedMoEConfig,
FusedMoEQuantConfig,
RoutingMethodType,
fp8_w8a8_moe_quant_config,
@@ -86,45 +86,218 @@ QUANT_ALGOS = ["FP8", "NVFP4"]
KV_CACHE_QUANT_ALGOS = ["FP8"]
-class ModelOptFp8Config(QuantizationConfig):
+class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+ """
+ Supports loading kv-cache scaling factors from FP8 checkpoints.
+ """
+
+ def __init__(self, quant_config: "ModelOptQuantConfigBase"):
+ super().__init__(quant_config)
+
+
+class ModelOptQuantConfigBase(QuantizationConfig):
+ LinearMethodCls: type = LinearMethodBase
+ FusedMoEMethodCls: type = FusedMoEMethodBase
+ KVCacheMethodCls: type = BaseKVCacheMethod
+
+ def __init__(
+ self,
+ exclude_modules: list[str],
+ ):
+ super().__init__()
+ self.exclude_modules: list[str] = exclude_modules
+
+ def is_layer_excluded(self, prefix: str) -> bool:
+ """
+ Check if a layer should be excluded from quantization.
+
+ Handles both exact matching (for fused layers) and ModelOpt wildcard matching.
+
+ The ModelOpt exclude_modules list is a list of wildcards.
+ """
+ if len(self.exclude_modules) == 0:
+ return False
+
+ # First check exact matching with fused layer support
+ if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
+ return True
+
+        # TODO: This special hard-coded logic is not needed for quantized checkpoints
+        # generated by ModelOpt >= 0.39.0, where exclusions are handled naturally by
+        # the exclude_modules config, but it must be kept for loading quantized
+        # checkpoints generated by older versions. Check substring matching below
+        # for patterns not caught by the exact match above.
+ for exclude_module in self.exclude_modules:
+ # Skip exact matches already handled above
+ if exclude_module != prefix and (
+ exclude_module in prefix
+ or (
+ prefix.startswith("language_model.")
+ and exclude_module in prefix.removeprefix("language_model.")
+ )
+ ):
+ return True
+
+ # modelopt exclude modules are not simple strings, they are wildcards
+ for wildcard_pattern in self.exclude_modules:
+ if fnmatch(prefix, wildcard_pattern):
+ return True
+
+ return False
+
+ def get_quant_method(
+ self, layer: torch.nn.Module, prefix: str
+ ) -> Optional["QuantizeMethodBase"]:
+ from vllm.attention.layer import Attention # Avoid circular import
+
+ # handle kv-cache first so we can focus only on weight quantization thereafter
+ if isinstance(layer, Attention):
+ return self.KVCacheMethodCls(self)
+
+ # handle exclusion
+ if self.is_layer_excluded(prefix):
+ if isinstance(layer, LinearBase):
+ return UnquantizedLinearMethod()
+ return None
+
+        # TODO: This special hard-coded logic is not needed for quantized checkpoints
+        # generated by ModelOpt >= 0.39.0, where exclusions are handled naturally by
+        # the exclude_modules config, but it must be kept for loading quantized
+        # checkpoints generated by older versions: vision layers are detected here
+        # by substring matching since they are not caught by the exact match.
+ if "vision_tower" in prefix or "vision_model" in prefix:
+ return UnquantizedLinearMethod()
+
+ # now, the layer is quantized, handle it here
+ if isinstance(layer, LinearBase):
+ return self.LinearMethodCls(self)
+ elif isinstance(layer, FusedMoE):
+ return self.FusedMoEMethodCls(quant_config=self, layer=layer)
+
+ return None
+
+ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+ if len(self.exclude_modules) > 0:
+ self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
+
+ @staticmethod
+ def get_config_filenames() -> list[str]:
+ return ["hf_quant_config.json"]
+
+ @classmethod
+ def _from_config(
+ cls,
+ *,
+ quant_method: str,
+ kv_cache_quant_method: str | None,
+ exclude_modules: list[str],
+ original_config: dict[str, Any],
+ group_size: int | None,
+ ) -> "ModelOptQuantConfigBase":
+ raise NotImplementedError("Please implement this function in sub classes")
+
+ @classmethod
+ def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase":
+ # Handle both ModelOpt format and compressed-tensors style format
+ if "quantization" in config:
+ # Traditional ModelOpt format:
+ # {"quantization": {"quant_algo": "..."}}
+ quant_config = cls.get_from_keys(config, ["quantization"])
+ if not isinstance(quant_config, dict):
+ raise ValueError("Expected 'quantization' to be a dictionary in config")
+
+ quant_method = quant_config.get("quant_algo")
+
+ # Handle kv_cache_quant_algo with proper type validation
+ kv_cache_quant_method = quant_config.get("kv_cache_quant_algo")
+
+ # Handle group_size with proper type validation
+ group_size_raw = quant_config.get("group_size")
+
+ # "exclude_modules" is the key in the legacy hf_quant_config.json
+ exclude_modules = quant_config.get("exclude_modules", [])
+ else:
+ # Compressed-tensors style format:
+ # {"quant_algo": "...", "quant_method": "modelopt"}
+ quant_method = config.get("quant_algo")
+ kv_cache_quant_method = config.get("kv_cache_quant_algo")
+ # "ignore" is the key in config.json
+ exclude_modules = config.get("ignore", [])
+ group_size_raw = config.get("group_size")
+
+ if not quant_method:
+ raise ValueError("Missing 'quant_algo' in quantization config")
+
+ if kv_cache_quant_method is None:
+ # No KV cache quantization, keep this branch just to have this comment
+ pass
+ elif not isinstance(kv_cache_quant_method, str):
+ raise ValueError(
+ f"kv_cache_quant_algo must be a string, got "
+ f"{type(kv_cache_quant_method)}"
+ )
+
+ if not isinstance(exclude_modules, list):
+ raise ValueError(
+ f"exclude_modules must be a list, got {type(exclude_modules)}"
+ )
+
+ if group_size_raw is None:
+ group_size = None
+ elif isinstance(group_size_raw, int):
+ group_size = group_size_raw
+ else:
+ try:
+ group_size = int(group_size_raw)
+ except (ValueError, TypeError):
+ raise ValueError(
+ f"group_size must be an integer, got {type(group_size_raw)}"
+ ) from None
+
+ if quant_method not in QUANT_ALGOS:
+ raise ValueError(
+ f"ModelOpt currently only supports: {QUANT_ALGOS} "
+ "quantizations in vLLM. Please check the "
+ "`hf_quant_config.json` file for your model's "
+ "quant configuration."
+ )
+ return cls._from_config(
+ quant_method=quant_method,
+ kv_cache_quant_method=kv_cache_quant_method,
+ exclude_modules=exclude_modules,
+ group_size=group_size,
+ original_config=config,
+ )
+
+
+class ModelOptFp8Config(ModelOptQuantConfigBase):
"""Config class for ModelOpt FP8."""
def __init__(
self,
- is_checkpoint_fp8_serialized: bool = False,
- kv_cache_quant_method: str | None = None,
- exclude_modules: list[str] | None = None,
+ is_checkpoint_fp8_serialized: bool,
+ kv_cache_quant_method: str | None,
+ exclude_modules: list[str],
) -> None:
- super().__init__()
+ super().__init__(exclude_modules)
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
self.kv_cache_quant_method = kv_cache_quant_method
- self.exclude_modules = exclude_modules or []
if is_checkpoint_fp8_serialized:
logger.warning(
"Detected ModelOpt fp8 checkpoint. Please note that"
" the format is experimental and could change."
)
- @classmethod
- def get_name(cls) -> QuantizationMethods:
+ def get_name(self) -> QuantizationMethods:
return "modelopt"
- @classmethod
- def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+ def get_supported_act_dtypes(self) -> list[torch.dtype]:
return [torch.bfloat16, torch.half]
@classmethod
def get_min_capability(cls) -> int:
return 89
- @classmethod
- def get_config_filenames(cls) -> list[str]:
- return ["hf_quant_config.json"]
-
- def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
- if self.exclude_modules is not None:
- self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
-
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant
@@ -158,88 +331,19 @@ class ModelOptFp8Config(QuantizationConfig):
return None
@classmethod
- def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config":
- # Handle both ModelOpt format and compressed-tensors style format
- if "quantization" in config:
- # ModelOpt format: {"quantization": {"quant_algo": "..."}}
- quant_config = cls.get_from_keys(config, ["quantization"])
- if not isinstance(quant_config, dict):
- raise ValueError("Expected 'quantization' to be a dictionary in config")
- quant_method = quant_config.get("quant_algo", "")
- if not quant_method:
- raise ValueError("Missing 'quant_algo' in quantization config")
- kv_cache_quant_method = quant_config.get("kv_cache_quant_algo")
- # "exclude_modules" is the key in the legacy hf_quant_config.json
- exclude_modules = quant_config.get("exclude_modules")
- else:
- # Compressed-tensors style format:
- # {"quant_algo": "...", "quant_method": "modelopt"}
- quant_method = config.get("quant_algo", "")
- kv_cache_quant_method = config.get("kv_cache_quant_algo")
- # "ignore" is the key in config.json
- exclude_modules = config.get("ignore")
-
- if quant_method not in QUANT_ALGOS:
- raise ValueError(
- f"ModelOpt currently only supports: {QUANT_ALGOS} "
- "quantizations in vLLM. Please check the "
- "`hf_quant_config.json` file for your model's "
- "quant configuration."
- )
+ def _from_config(
+ cls,
+ *,
+ quant_method: str,
+ kv_cache_quant_method: str | None,
+ exclude_modules: list[str],
+ original_config: dict[str, Any],
+ **kwargs: Any,
+ ) -> "ModelOptFp8Config":
is_checkpoint_fp8_serialized = "FP8" in quant_method
return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules)
- def is_layer_excluded(self, prefix: str) -> bool:
- """
- Check if a layer should be excluded from quantization.
- Handles both exact matching (for fused layers) and substring matching.
-
- This method handles both regular models and multimodal models that use
- the language_model prefix. For multimodal models, it checks if the
- module name (without the language_model prefix) is in the exclude list.
- """
- if self.exclude_modules is None:
- return False
-
- # First check exact matching with fused layer support
- if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
- return True
-
- # Then check substring matching for patterns not caught by exact match
- for module in self.exclude_modules:
- # Skip exact matches already handled above
- if module != prefix and (
- module in prefix
- or (
- prefix.startswith("language_model.")
- and module in prefix.removeprefix("language_model.")
- )
- ):
- return True
- return False
-
- def get_quant_method(
- self, layer: torch.nn.Module, prefix: str
- ) -> Optional["QuantizeMethodBase"]:
- from vllm.attention.layer import ( # Avoid circular import
- Attention,
- MLAAttention,
- )
-
- if isinstance(layer, LinearBase):
- if self.is_layer_excluded(prefix):
- return UnquantizedLinearMethod()
- # Check if this is a vision model layer that should not be quantized
- if "vision_tower" in prefix or "vision_model" in prefix:
- return UnquantizedLinearMethod()
- return ModelOptFp8LinearMethod(self)
- elif isinstance(layer, (Attention, MLAAttention)):
- return ModelOptFp8KVCacheMethod(self)
- elif isinstance(layer, FusedMoE):
- return ModelOptFp8MoEMethod(self, layer)
- return None
-
class ModelOptFp8LinearMethod(LinearMethodBase):
"""Linear method for Model Optimizer static quantization.
@@ -344,7 +448,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
def __init__(
self,
quant_config: ModelOptFp8Config,
- layer: torch.nn.Module,
+ layer: FusedMoE,
) -> None:
super().__init__(layer.moe_config)
self.layer = layer
@@ -686,7 +790,12 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
-class ModelOptNvFp4Config(QuantizationConfig):
+ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod
+ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod
+ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
+
+
+class ModelOptNvFp4Config(ModelOptQuantConfigBase):
"""Config class for ModelOpt FP4."""
def __init__(
@@ -696,7 +805,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
exclude_modules: list[str],
group_size: int = 16,
) -> None:
- super().__init__()
+ super().__init__(exclude_modules)
self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
if is_checkpoint_nvfp4_serialized:
logger.warning(
@@ -706,28 +815,17 @@ class ModelOptNvFp4Config(QuantizationConfig):
self.group_size = group_size
self.kv_cache_quant_algo = kv_cache_quant_algo
- self.exclude_modules = exclude_modules
- @classmethod
- def get_name(cls) -> QuantizationMethods:
+ def get_name(self) -> QuantizationMethods:
return "modelopt_fp4"
- @classmethod
- def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+ def get_supported_act_dtypes(self) -> list[torch.dtype]:
return [torch.bfloat16, torch.half, torch.float8_e4m3fn]
@classmethod
def get_min_capability(cls) -> int:
return 80
- @classmethod
- def get_config_filenames(cls) -> list[str]:
- return ["hf_quant_config.json"]
-
- def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
- if self.exclude_modules is not None:
- self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
-
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant
@@ -761,105 +859,25 @@ class ModelOptNvFp4Config(QuantizationConfig):
return None
@classmethod
- def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config":
- # Handle both traditional ModelOpt format and compressed-tensors
- # style format
- if "quantization" in config:
- # Traditional ModelOpt format:
- # {"quantization": {"quant_algo": "..."}}
- quant_config = cls.get_from_keys(config, ["quantization"])
- if not isinstance(quant_config, dict):
- raise ValueError("Expected 'quantization' to be a dictionary in config")
-
- quant_method = quant_config.get("quant_algo", "")
- if not quant_method:
- raise ValueError("Missing 'quant_algo' in quantization config")
-
- # Handle kv_cache_quant_algo with proper type validation
- kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo")
- if kv_cache_quant_algo_raw is None:
- # No KV cache quantization by default
- kv_cache_quant_algo = None
- elif isinstance(kv_cache_quant_algo_raw, str):
- kv_cache_quant_algo = kv_cache_quant_algo_raw
- else:
- raise ValueError(
- f"kv_cache_quant_algo must be a string, got "
- f"{type(kv_cache_quant_algo_raw)}"
- )
-
- # Handle group_size with proper type validation
- group_size_raw = quant_config.get("group_size")
- if group_size_raw is None:
- group_size = 16 # Default value
- elif isinstance(group_size_raw, int):
- group_size = group_size_raw
- else:
- try:
- group_size = int(group_size_raw)
- except (ValueError, TypeError):
- raise ValueError(
- f"group_size must be an integer, got {type(group_size_raw)}"
- ) from None
-
- # "exclude_modules" is the key in the legacy hf_quant_config.json
- exclude_modules = quant_config.get("exclude_modules", [])
- if not isinstance(exclude_modules, list):
- raise ValueError(
- f"exclude_modules must be a list, got {type(exclude_modules)}"
- )
- else:
- # Compressed-tensors style format:
- # {"quant_algo": "...", "quant_method": "modelopt"}
- quant_method = config.get("quant_algo", "")
-
- # Handle kv_cache_quant_algo with proper type validation
- kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo")
- if kv_cache_quant_algo_raw is None:
- # No KV cache quantization by default
- kv_cache_quant_algo = None
- elif isinstance(kv_cache_quant_algo_raw, str):
- kv_cache_quant_algo = kv_cache_quant_algo_raw
- else:
- raise ValueError(
- f"kv_cache_quant_algo must be a string, got "
- f"{type(kv_cache_quant_algo_raw)}"
- )
-
- # Handle group_size with proper type validation
- group_size_raw = config.get("group_size")
- if group_size_raw is None:
- group_size = 16 # Default value
- elif isinstance(group_size_raw, int):
- group_size = group_size_raw
- else:
- try:
- group_size = int(group_size_raw)
- except (ValueError, TypeError):
- raise ValueError(
- f"group_size must be an integer, got {type(group_size_raw)}"
- ) from None
-
- # "ignore" is the key in config.json
- exclude_modules = config.get("ignore", [])
- if not isinstance(exclude_modules, list):
- raise ValueError(
- f"exclude_modules must be a list, got {type(exclude_modules)}"
- )
-
- if quant_method not in QUANT_ALGOS:
- raise ValueError(
- f"ModelOpt currently only supports: {QUANT_ALGOS} "
- "quantizations in vLLM. Please check the "
- "`hf_quant_config.json` file for your model's "
- "quant configuration."
- )
+ def _from_config(
+ cls,
+ *,
+ quant_method: str,
+ kv_cache_quant_method: str | None,
+ exclude_modules: list[str],
+ original_config: dict[str, Any],
+ group_size: int | None,
+ **kwargs: Any,
+ ) -> "ModelOptNvFp4Config":
is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
+ if group_size is None:
+ group_size = 16 # Default value
+
# For FP4, these fields are required
- if is_checkpoint_nvfp4_serialized and "quantization" in config:
+ if is_checkpoint_nvfp4_serialized and "quantization" in original_config:
# Check if required fields are present in the quantization config
- quant_config = config["quantization"]
+ quant_config = original_config["quantization"]
required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"]
missing_fields = [
field for field in required_fields if field not in quant_config
@@ -872,64 +890,11 @@ class ModelOptNvFp4Config(QuantizationConfig):
return cls(
is_checkpoint_nvfp4_serialized,
- kv_cache_quant_algo,
+ kv_cache_quant_method,
exclude_modules,
group_size,
)
- def is_layer_excluded(self, prefix: str) -> bool:
- """
- Check if a layer should be excluded from quantization.
- Handles both exact matching (for fused layers) and pattern matching.
- """
- # First check exact matching with fused layer support
- if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
- return True
-
- # Check regex pattern matching for patterns not caught by exact match
- import regex as re
-
- for pattern in self.exclude_modules:
- # Skip patterns that would be caught by exact matching
- if "*" in pattern or "." in pattern:
- regex_str = pattern.replace(".", r"\.").replace("*", r".*")
- if re.fullmatch(regex_str, prefix):
- return True
- return False
-
- def get_quant_method(
- self, layer: torch.nn.Module, prefix: str
- ) -> Optional["QuantizeMethodBase"]:
- from vllm.attention.layer import ( # Avoid circular import
- Attention,
- MLAAttention,
- )
-
- skip_layer = self.is_layer_excluded(prefix)
- if isinstance(layer, LinearBase):
- if skip_layer:
- return UnquantizedLinearMethod()
- # Check if this is a vision model layer that should not be quantized
- if "vision_tower" in prefix or "vision_model" in prefix:
- return UnquantizedLinearMethod()
- return ModelOptNvFp4LinearMethod(self)
- elif isinstance(layer, (Attention, MLAAttention)):
- return ModelOptFp8KVCacheMethod(self)
- elif isinstance(layer, FusedMoE):
- if skip_layer:
- return None
- return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer)
- return None
-
-
-class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
- """
- Supports loading kv-cache scaling factors from FP8 checkpoints.
- """
-
- def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config):
- super().__init__(quant_config)
-
class ModelOptNvFp4LinearMethod(LinearMethodBase):
"""Linear method for Model Optimizer NVFP4.
@@ -1157,14 +1122,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def __init__(
self,
quant_config: ModelOptNvFp4Config,
- moe: FusedMoEConfig,
- layer: torch.nn.Module,
+ layer: FusedMoE,
) -> None:
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (
detect_nvfp4_moe_support, # noqa: E501
)
- super().__init__(moe)
+ super().__init__(layer.moe_config)
self.quant_config = quant_config
self.layer = layer
_nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
@@ -1802,3 +1766,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
k=x.shape[1],
e=layer.w13_weight.shape[0],
)
+
+
+ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod
+ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE
+ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod
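A minimal standalone sketch (with made-up module names) of the wildcard exclusion semantics now centralised in ModelOptQuantConfigBase.is_layer_excluded: ModelOpt exclude_modules entries are fnmatch-style wildcards matched against the layer prefix.

from fnmatch import fnmatch

exclude_modules = ["lm_head", "*.mlp.gate", "vision_tower*"]  # illustrative patterns

def is_excluded(prefix: str) -> bool:
    # Wildcard match against every exclusion pattern, as the base class does.
    return any(fnmatch(prefix, pattern) for pattern in exclude_modules)

assert is_excluded("lm_head")
assert is_excluded("model.layers.0.mlp.gate")
assert is_excluded("vision_tower.blocks.3.attn.qkv")
assert not is_excluded("model.layers.0.self_attn.q_proj")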
From 0cca9b4d130b4caddb60086ef26a0d8741582dcb Mon Sep 17 00:00:00 2001
From: prashanth058
Date: Wed, 19 Nov 2025 19:50:37 -0800
Subject: [PATCH 052/249] [Bugfix] Fix precision loss in LoRA-wrapped
RowParallelLinear by fusing bias into GEMM (#28972)
Signed-off-by: prashanth058
---
vllm/lora/layers/row_parallel_linear.py | 25 ++++++++++---------------
1 file changed, 10 insertions(+), 15 deletions(-)
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index 2ef1bd98fc612..95517b1aee263 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -63,23 +63,18 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
input_parallel = splitted_input[self.tp_rank].contiguous()
# Matrix multiply.
- output_parallel = self.apply(input_parallel)
+ bias_ = (
+ None
+ if (self.tp_rank > 0 or self.base_layer.skip_bias_add)
+ else self.base_layer.bias
+ )
+ output_parallel = self.apply(input_parallel, bias_)
if self.base_layer.reduce_results and self.tp_size > 1:
- output_ = tensor_model_parallel_all_reduce(output_parallel)
+ output = tensor_model_parallel_all_reduce(output_parallel)
else:
- output_ = output_parallel
-
- if not self.base_layer.skip_bias_add:
- output = (
- output_ + self.base_layer.bias
- if self.base_layer.bias is not None
- else output_
- )
- output_bias = None
- else:
- output = output_
- output_bias = self.base_layer.bias
+ output = output_parallel
+ output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
if not self.base_layer.return_bias:
return output
@@ -120,7 +115,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
return lora_b
def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
- output = self.base_layer.quant_method.apply(self.base_layer, x)
+ output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
x = x.view(-1, x.shape[-1])
output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
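A toy float16 illustration (not the LoRA or quantized-GEMM code path) of why fusing the bias into the GEMM matters: adding the bias before the result is rounded to half precision generally differs from adding it to an output that has already been rounded, which is the double-rounding this patch avoids.

import torch

torch.manual_seed(0)
x = torch.randn(4, 64, dtype=torch.float32)
w = torch.randn(64, 64, dtype=torch.float32)
b = torch.randn(64, dtype=torch.float32)

fused = torch.addmm(b, x, w).to(torch.float16)               # bias added before rounding
unfused = (x @ w).to(torch.float16) + b.to(torch.float16)     # bias added after rounding
print((fused - unfused).abs().max())                          # typically a small non-zero gap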
From fe25772aa97beb8bcb07ea49e06a2892b521a7ed Mon Sep 17 00:00:00 2001
From: Canlin Guo
Date: Thu, 20 Nov 2025 12:38:12 +0800
Subject: [PATCH 053/249] [Bugfix] Handle broken frames in video loading
(#29001)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: gcanlin
Signed-off-by: 凌葭
Co-authored-by: 凌葭
---
tests/multimodal/assets/corrupted.mp4 | Bin 0 -> 91678 bytes
tests/multimodal/test_video.py | 37 ++++++++
vllm/multimodal/video.py | 118 ++++++++++++++++----------
3 files changed, 112 insertions(+), 43 deletions(-)
create mode 100644 tests/multimodal/assets/corrupted.mp4
diff --git a/tests/multimodal/assets/corrupted.mp4 b/tests/multimodal/assets/corrupted.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..c355bb932ceeeae13cc2d0a4752dcdf8c5136720
GIT binary patch
literal 91678
z1D}m+EN2*Au!2sDk|M*Wo+HbNrxwXCd#Tay<4hV}c7xUkoJWM6G+6{mg5R#n$(9?w
zMA6I0iToIs69wtSQ61zaV}qsNas`(Fsby4o*l^x?-Yb7+)2Bxz=6UWK91jcvTSbu8*35i0
zEGOGPA)U<6UCQ^>AXCh)qJ79PQE4;U1kn?NO6&;xhLpv6wI`F;(L+B5&+-e5cWtkk
zWN{P%f=|B40%9yX;*`=SxiVrf=UCmFr}GfH?;^S=2m)iyhrStKAay*k&OC>A#t)~s
z@4qhP#!J}#2r*($JA|?HJ6T$@dO%{a>}Y9+aRYE#<;^n9iD}$fhxZE4e9pIWoFigLj-1u@cyWc9az2WXF1CR?3RWEz>cD%UDBWBt;1)
za9p81lneYJW^BY|YW&0wRF2#~vwlY}-N*_DIX>yw!F>oSs5;`#?zk3~A!qMAfZvlqp6kpbYm9g-23-Z&u
z=;jwP!818B90?0TY7vLUE)_45*@_4|UOG@7@el5|G-`nX`W78MI$QhR04PXtw7M9!*SXUXjbhse-?6D;5js}JBawO
z-@~?9^ew3Xch<{e%QOpr3kwotlZP4dS|9;|5$8R`_-@3>d_9WS)K8OM(DV~6y5f*I
z(Jh}LpA8RO;uvBOM?E{ypIN6N{N}>!0H|k)OrDDF$hS;;9EglxA(?faRg+u1?y)}0
zq6K#jfBB#NK)UXLI4P3!?S}bCyXU_AFvxmGAO%O^KSQGvn__YfV4p!vIcI@Ou6yX6
z=>?G79Q`VZc-I>S&LW>FDPas)yCS&8FkRn~5r7DN&Amdon<^45#_?=V+va4NDq-CF
z34gNq^=P7W+VDACHRfw8YX#f}6%e?}lQ04(vlgAtaSz-g+D9H5@l&20SO$mw;tK<1
z=BiXsGZ>Y`TsKcIIOiu%{~xG86!Vtcv!wOgo^ABD+ELB*bp@gDpoWvnu2BV)L4HC%
zO5hcmW-SHgLM+v>t~}z;Oqa(3oPzkf{wngrwj(amDN{ipXMFuB1-Oz(ESPum;RkBY
z`J
z^gpH4mKLt$_N;rlLm{$$065`vh}(mrKn=>4roX@M=YbQqVKzgxK%miB*dHtP&sp*c
zVD!&ZiGXPEkCpGCzlld7nhcuU{S%s9gSdP3Na0+M%Le|9Cuz$)iVBsq2t%Un$cK%8
z0SgBz4QWdovu5cdMT4-Km1PbQjF}q+vw8O_GQ~t!hi#Y3LbmZijp3B1)KTs8Y)I`qz#Vs7VZMQ7Q{eB5>H39^}M*yTrBp2mM;-y(j
z2YGy@J*{1hrjPb4qNa57v-M^TDAfZMBR6e2N950qSZFx1jjHKr+E
zUY$1)n5r^uK%uT>S0Y)4_%}1jDK@<-=iR0O0GY^$=dt?pw0!eXM~{32Bmq^vqmMBG
ziYmlj`KZ>PIvizvwn$Fzd|#?V{bD>?oiPS=#aUdB;KQu#E~vwtB4rdLoq8DY?c?DB
zH)V6(q3B^V@}rA)+EWoEkYd!i)Tx|2qAMk(DOHgqx|!$RO*Km)+xztCAn9=r?y7Z*
zv@!5i?~a=6Kmq7*7?d+1S(UM-FTS)-`Em`NBDHJg>eDq8q&XYnhv9QF2CSIqj
zzioXPj>Y|VSR8Ln6t)gZaP(w>Mc5LDlhZ6dtnn`utWT(__WqMzDGnj~x^3y@y`MX{
z+vI^h*0+97pQZO!TdyqGY5kJ3lt`^^k*wk7`*kPmziw|kXgD9JWTPD;LVcS@
z?+%KU6=Br*Df5hv-DBT(uVSeml+vk>Cq0^od7S|>|F0a*Fo#c*3_(Y32603tr=ftSoG#WKX_Wp
zqI?dQpd4v4NBKbl{Z^q`dlZaKfGRZNl|z==Bj9ggQR6G)v?Fht>`B!!&_VVAFH6c3
zMrc$&cl0|f5P2KQx~dI`d)uT{W7$q)fv@UO2^9pH(&rRTv-A*7Emi~D#E~X_ybX4g
z+uHMgFs-imF0mQMdVq?%T;#LOZqtTgnh3QuQw^Z4012_4P*n}QflXD?!fp-L1sy?I;Uh2&AQ|ACHe`bKJHt4DsVz?DI6ANO5X)2H*|u(#
z2f}$NQGprRdu|2s|Cy5yG(01dYP5@WiiINd{1e9I?FVd=_@f|S#_Jfsxl3h+8F&Zi
zqt0_`)WYxTPaq&jSXz9pQnlH0XSo~M<(1%}L6r|JFKwH&SwG{Ya&bo3lFFVcWOT^$
z!-_7!+!59~0
zqhaj&FBM2zx=6+FqKD`su4C6=Q9+8z?2Ncx!1@B(tZ7=~2^tEMlBYm1g?u>}t0z+q
zG0Aaf2c13z_COV0JZbMpZ&DAg(WM%`2sDEiXNtFOpy4^@FPp
z0aQgxwK`0)68+knFsJ^sbi^V=?@2g~qwNV$<2Z>&+p|uagou*5i@1mIgvk>IW9TX0
zVK=?__c-i&F_7bIzppGsq$tw%?b#Z2!m0+Ay}1F>k~-F4nX?#Z!KJ_dRz-pc+HAMj
z&F&y*yu$3%1`A==F>yzXnI5^2r1|%QR(L&AwgVH+;p(mlb^4Zne633pCef$06-rwa
zyj4!xHh4D^*RjKH+AC(249f0zDLmg^r;nusm*uDRH5QWI#p*>|PA=OP#lIt^;k#vD
zBx=D5AovzhyLz0~Y&LVbfGxOG^$l&h8
zoq8u^VgN6Rqyk|7Flb?nK#VH`ax6wmtuwH3!pK12b~!))h?|0?R4WJDD*LR
ztkxuTZr)p7?pdoHo*F9#618$~EXc*jgfdO>uJTM^BB5^8Y%^s4xt78{RUUgxps+{E
zzTwfq?_rfu4aZCWw&=Dux*?VEYspGb$xr~Po@y*Z)`ZpmZ(7#Z#85M?FJ6od`}Vt2
zD@9>elu?3Qvi-9f@iDTrMB1!f)jjQ#Vozd=^BF^Hs?}KKgGPWBsWe!os_n#gIts!k
zzrg`x5M|z=%J44QXoMgTz_-S0kg&zN@V-@;#*VTWP%PIcTL4O#!^1Ao3_>doyaCp%jT=}gb2NqFoMufnh#7x(vCwwhnc@i5
z&^)Rhes_SZW$gHuqD)-5=4F392%IV2`y#E?Ld3sLAbCJA-d(jqzdvEPuWmxZSWWm<
zxjbC|*#?EJlg8xLMx988v23pGaPDH}2ycQ0WWEj;7Q84(LJq-(^zFo33J72!KbPK4
zOhJI;rMk;5OB&a!M(J!9d)Fw?$#Tr<(kmg90=YE7dA$r8Y_&aAGS
zQ;R4`8Y09*_ZV>l(!ogu;WYbt*FNYv-$MnOTjVmGO9EF!nJ&6kQ8?M`y47F7&jXb7
zGe+@7AeAEX;wrU5SCr??XiB+xBkp-cu#f{5r52rp5e2Qg^J4$Y*RQ=G$5zo4`Qx|sJ?p&92uL?f?%Ilut>)zWMcH<$a!w9F4E=6x;;+t9k&4!3OMj4C4lra6VMb7M5dFWV%z;s
zsol|x!#@dTy0#R2O+n_|a|#Pg{v-;ahSR=&>Xm
zu0qTlIJ=bqZd$YMp>CZQ1cT+AY7oz{H{3F}LiO+=p)0ESyn=e_8qScgW>nI{k`rye
zV+Bd9hp;L+3ahbtohx?S=2T?di@!L!5@Il9x!2$5pz>_ST-xRshFE+>Ug<20C^svY
zY%%Ib>=bYKx}d|M4zmi%*9Ks%pnB*5;r!_d`c4K_9a-qoomO|xqoOYAq_i&44UONi
zjaO!Tu_r*~>U~hUB5(GO_%nyV;+e89mAhg)sl-sKY`U8+6tghh*%nr6X)=ase+WWymgjXFo7ZvU>f%Apa^P0e>vYxk!R)w1fw{TmvAaAK)sHL-{tB$IaryxfhFL!eAP+peI`*RO4FGx1;H|r`b
zV&D=4#d#gS^_=c)kcQ~ro1PPw`q^yVaXo_6F=^QKL`}&BRe5C0YfNHUyp=xfVz}1t
zbR~7^yh18<_C}MuRZcbPlgYAUUsR`BZY5T(zKPJr&h{YKZWPusxFRRIxBrNv|_!$@FmNnXJUbT(%goHIz8k2%4XS9*^)fsY{I;F#3{eL
zIGE+jx=`wuv($h#2Z_(c;=NMGINKXp0juDBg&j(#7UL^Ajw72hnxGllorp3J`QU33
zYCJ?luyy(rx+12+whnj+&o4Ao$GLH)y~Rxjz=Er5`dgye=!n!qgJLhVU(Gfq@1B1)
z4=d9-yg#%Sl`1mLA?%8($gG_G5i3CWeJeh0LQF=jL?J`l(Fum8x-ge4|?THx~Gd%-8om~=jGE*(~R+Gqw6?$~CB@HJ+pn{%TDV|Ae+$(xj^D(RIubb(+(U{l5BrXC=~cm#u}h)laEd;L2SwnkzF)PGKql;CK}O^PN2NMwa=QY!=~nNo-UKfl
zQ&c)>d$Upz9plsTGQ?Yf7)4mpwEwb%7Udv(Prt`_i1i_9^GHyxg(u+*c|xeVW^lXa
z4|dpJ!Y=C$q)}$?Bi0c2#U*5M90ePyLepbLrmW=p=1CjJo`kzWXzXZ{QiiQDK*1>g
zCk*V3Ni9NsS|Iu7Bw9eDwD;2WeowWPk=*I+QBEfn{zUttKW0qnSdt!x<2H
z&NWwo0G~_E37khUG|Qpksm_>g$Og
z|4KNnSVK7?!T{#4CQ-DYDtpWSy8Li4w2F8uM~VaHeM{8RRF|32&r};m*&^p7JR()$
z5oD+vTRPz5x{fXPk^3G_q+wD|bh?ln1
zhLz6cq#Io&ZYiKo9@q8|Smkk&U$gf0=WtD}x}`^Tia`t+PfYWt9fcrrM6?CnKc4;Q
zOUY`90-2hKpyT#>-z3+n?M05C4FRpSSi4$2lu3FEu#{h6%+xJIDrFt~U2ax;@Ve1H
zkYq@=lL2szX`jMtCyI3t2hEtKXb=aKsNA#|t#Bs%1-i8U93cwUc((1)SL9+iZ-p$B
zLmzKjN%)$F1x!e;3XAcbjHD#CNnn=xfC+|Z==FluC>bo~k0H_1@pcL2n>TcdJHg2>*LC*|{QCC-|3L@=a
z(wQDwy8OwWfeQ5lzzv(#><1vinmlpY`Ygw)YyR%Bd>^!#I6fC@N_x)SP$K0&2Dq`d
zY*GY>wL_2Z_QP}U*<1-o$l7VAwYFxTVXUi`{bFsy(z?1qJ=EW_hxIBwS}2>9*_83~
zjwnQ`+zhKdE1ChO^k+{x{{6~P*%n)%nur-5(m%D=%7C$`bZ}M2u2H;Y}nVG6{Qx=5YB9j
z6&m&_EZ|xh$FWBi;DMm|aM5*0Y~e@IQ>O_q9>pcGQ5TcxW3W34I4Y~3F3oYvMX;U=n3betR}KrXQ;%_rK9Y#?4zN61Y#@!
z=3Sl~O>X`V08>D$zh<2<7t}!fIGrPfbR|Kg@YST{X7-&HPgdU(Eo?-!zKKGBg0w(dmok1Xqsl9#V!!@y3gYqACoj=aATAm3kbD-YrVRdo_7rHbvyNs(X?WQ
zq-(&3QWlGx#wf1>ODX5y7vmqx#N~)7bXyTO=+Z5d%B_+|#^h*?itkJ%+kl5&vJi41
zG!H++$rQN+q}?dgu2K;?#2iyj+KuoVzr{v9uJdYqh(39`_RoFfW4NSl+9x;^gj|+z
zGZlhCxSB|CT(TB~eQpGXPDCAfq#=X$Hx#qv3D#VP7jt+QU}^Y?J{8}3{!EOHvV3P{
z&=d;CZ+5Ow(@!ad;;*`mm7Gh36ms1`$LDv|L9O%=Ovp`|OM-A1ZfrHr=@pEXs|W;T
z@|HjLi}Coy!js~c5>4?m7X^5597JNOhaNR=!ecX%x2%&8nR1hJqu_q7L9EV{tRNaU
z75H}2&nw|Oh51x%>LlcfXuS;HvIpBzLVl1EAr*%6d>mXqI;~n(zt&T*to}@hgC+GC
zB6}WzijIx~xllplsyx=%Gix(s*_Coo0=Iv)HrECHkJ2d>^9Pf?Iu|8@<#Fp_A~X?1
zgxQ;vF3kaj0~&qq#R7kl7M+DJnm`z14@yVW;D90AM6M`>
zYs9FYG@nyCBTh0})}DJfpxnPelmYnrHwnG}pq#)M4g{sUf4eI@;VzzUrr@4lR5G1p>$_?^+BR*K-8Qs~
zwsV!hy_UbgdS4UhN-J>s=-rO^Hy$)y9Zfl_vHjDR5Ttc122l~Fy!xS8^ZX&aJsuU1d?)(hUx94jVpEAfiGau>
z<6wu#=o`pp2v&nUa$!)Qu-s(2fn(5!e!Nx94WX#tl*9tNq*6~+b85p}cxaJrjPq>M
zkdPwg(}rXklii+(`7d%PQycBigvL7mr>q)LZ}Y&XN@d$dfE*=l=FC>fHjmHTn$O0p
zWbc?sncMfcu7+`HlI!A|I5q`NfKay&m3i;Z6NPef
z>HvYMo9eOTuM3vM2DWsRg`J+4u*<$SH2dCibjB%YbB7QszF@^caM}9PyPr;z%qBzR
z@dhzJaJZ}-0KWdQP(E5ebtnMFC_apoXM`(Q*isAc1fDuHEko5mo;a6!>Dq0>tFpCk
z%jQbB^Y!ONt)%pVCbH_QM^$+#j1TtkmL$CLL+$o$;2%#{*jzB%`nwLkmTY$V25;(I
z_bQ#fp$WTQW_hVDIyPSqO
z-tZb#htUT}-S6%(bpObLdA6OU|AfnNu8j>gNUU-TzS8a-{eG3EX!*X(`J6j-5axhqc=mJzyK`=&Qb?@d$$E}@
zlV;YO9>U}5S*^??4H2FH^~o@{T8*t&F>KZ@ZIi2=M{d8N(?cWS&$JIgTyGfWyPDF~
z{3yLLgS9jAbdgFm!|)VTC_Yl?L9rByM2&_j?MtKb#A{A8)8C}Vtc5y=V-qWh=HP6A
zEtm7Z?f`tu~*Ra;3IW|d!M
zslPnkBtMXyRSRUfgr?PS-zD00q2?(irP
zySr&(0-xr%;?6!i6BYQnsTtr8SUmb#JgO&DHfM#=9V)~5j{2^2%F6`%VGT*>Fb<4D
z-{9T%aROAb11;j`*Pm#Noh7qyotbuIb38iRzhEYj=_(qiSODRHWC6*kVNU^ATdX6u
z?6YCobWsL=Jb>R50)bA#ZZ+F}4^_Gqley|Ek;IKcMV5BxtOVie@9tLe$TouzQq&p<
zUeKZ$zP4Gz0D2h)M0^O(!d|fsM+R(Z2LG#y8(NDr058w{5jKmM6f4pht`18aLt00A
zrO68ac07}aN)`wzSy|$dzU>Mw#;QsOcFA+eVWOy8EgsxD!5^#QVwD|wJ7BttO6`D1
zSPwq=T(Pl!_-rLqFS`a%_#eB^nGQ0)kVq2~P&__XCg0U^AW|(Hp{~U#8#r$1I{4|SBSmYgOz0MRjHphKYs@|kl_J|*irI6y@
zIg~%`YKl@h@E*E^r)VumzbP!vR>mqvWwg&27QO|T!
z^h&e#ahq4)tO&S3mwqg=8bnm55RsuMe(3OOh}OT9rF#H;`ki3%DECUdw;zfEikh-S
zrPH}n;^a9EHO0y*l(%IgjXLU?UUa6`A=Pi`uFs;o>~1>5aHbI5Bk3|y8J^|H=+=xxX>S3;5b
zt5>4aY8BuzYq4c^umN?UnU%l1hl9#gYa5G)SO3a-vQ?K@#~4TZisHY`Q(-Lk0%Ux3
zN#6f7mG;1F3TU_@p{ddC_z!8eFr4LsX-w@iG>`MzKvII~mw+^V)`~!HxyCww*z|lL
zT`4;4#T!#*{|fnJ{d2K!`X~`Cw+%Kp)h7RmpXWD1lJMe>-GTgH*c%pCk=0>W?dw{^
z*luarPb-}VQNp~9wATJN^ayEM7Q~-MOL095qR_kv)msol}eJ&WX6$Q
zc5uBvo|8|9$JdXcHf#8K*lq7y{*^~IqgTw`3~RhAUJUHh%%Igle2~=lJkv|&(HYE4
z`AB8yW4J!llvoLG0I0oSA+8h!P>@N;o}<|!PSa6`2bsu#{&u1*%C|_hRMio?32jN<
z8%k?2jawo-$@)1=BH5_`!BzQGa3)^Y_>t8r6RetvVBbjs(Cj4Op$6$5VM$_^qLhBX
zLjVlfx;`bj=m7T6z7{G4!J~QX*(E3^vRG8d4w<-hpk~BtB*xMLL_ljD-~hnm=B7Y8
zzuhd*Uqnsprpc1JX)?EGzlw$FF%-xx_?B@9os3gKFd%?&>muqfTEcEuzH*54w%zdS
zB><=RHCx9ifzk}uX7pR!P)n|Jo=7qkO>|$S_ElfZ@rAYzWUA0#3xjT7Oa8PRPp;L5
zImIPUqb;Urxv>|!9-v~3?25Y;eisXw*a#!5s9qErKOa)a+J=ujlN6I`l~V#Q6uAZh
z$}oYOA+s3TWBBUO(jS*nM8hf0H>r`OgQDXv?sUdLfh;_E1#w=Qg9EaQe6yxu8c+od
z1UHA_c>&c`Fa$w-nKs7as>9Fywv!Vr$jL|qjQ!2bZflJ*)Px$^mH|G1_mM27aLf&X
ze{RUY`}^z(4i>Jk!LBaaQPVmJR)v?2D|Zm{HL!I;XL-NsJ9>!p`i!NNHPo%O?4dRb
z>tk;W!~eMdJ-EPUhr!UCLzoYtMMV{x_dbKq@xQbmGP>)-KM!GdDbk+
z68%0kS~WtR%&fx5e&wUC4(n44KV{G7@jPtFi}C-qu>QLuNQTeTZ_5RTaLD!1s*LJ*
zs*C481SsX`XYNlsCj1h)*HVwqukM61C%ozDCng@zM{!q8vOjwEP+F{ynzR)iPIh2t
z%V_YYWDPQs{-e5dE0?J@uC^E9SgPu1cqlKSG9!YYasR4wC=6l=I35hTHHS3^pnZV@?R9NSc|$1OZ3_WchTeZj|D0Al^nvWxPN)v
z-AEm=Z7FDazX{g+n06a~CZWSxW<1fHcFV)QN5)ZsdkIc}i#*L*Ebc@O{28i^beP@wsoH7Jp?&}1jEOW5m<+sMk8g;
z{nszd$YxJ;G-_l*rH#atrBnX{)$H*%QWNSafzAKD!^p0<;mbuXYp_I(4ybkbFyLdU
z!|5mx@<7SlIk2ZFxWmaNju{#mIR0i_So(MUlgUWO1p_RQnjj%#m%XN#dAaxgGVKq%
zPq;#ohRT609JlSGtKiPN9&k})YIJvul`%&yQ8k)|qY%v(6tRljn
zy*2>H-Ws2ZNO>$=q1YCrvhv;g_jC5L{zM-j>KAeB7`pC9da-OsR#`xBcT2*I$wWR1
zEqnwL8r!|=A0$7Dk56It*|FU5eM}5(9()b<41r?drluoig^HeJ-Wd{V)lT&x*ksb@
zILVDYl?TW@t()esXaK@8dc)pRVs1H2To?qBRtLcRm|O;|Zeg28kGI!VUeW${Au3QQ
z>Kqc(6-LDaJdxU1_Yw9SjZC1SPQZZ#>Qa0IVP)~XVB}0xGzBkpvW1J8!i0l2w<;Tt
zJ*^_;$$Jx1S}4Lk0zd*qV3D7`g5X_t)qE-v*r=i%YxA3utE19hKA~o6DR-jzy8bv7
zIp|?H!oMH2_4%%&X2w`YKa(K*h!+GVRZ<>H2<_KuJlDowF+oVIX-&`sv#be7tuTap
zNbRhLt?v*6LYt!QmitZAB(&c;cxw+j^|vBC-=MA^F)lk!^Na{G!c`vDTiUVbF28B`
zcoJ!r@bj?#z(q-dv_iB&XEWnkb7QA(sao$a5_EGzTkFMCTz};%*2gZEO;8X!IC{~2
ztxmO1VurFX4d&QV_teeomN)LWeIfG7Q#8!UX7)4j?sSc>Bzdx`^d27;#e8p~r75w3
zh{?g}Vd;7kwFw@;9YW^Q(Xu4hjHBbj8^?yD1TBs;9%MOJ5sKrV7oPN~kthfVyNp%K
zQ{Wo5zqmQ0cc8R8?HpBe1V(sE07YNa@*iu{L^HpVNsDSKYRShlKQoY-KS^l8+u>+S
zm=HOY$9?nF;=l;ELN~plxvVC6C#Ld$p3dobeU1Z6&ILe2kN#s%EBu-pF;@_)9+tw|O)X9w*#)E~o(g-5
z?z6Y%jzLH3!eT3{$w!^O<;V8&0|mg8j?v-UHbj7*h4lj)!D=N6wezk4O=gQXe
z?B0zeIPM_s+MUzEbpyTnmi)8Aq=TI7gC_BA*a1+jmg>Vc(7u;0-oTjB=R?n;SZI_C
z`9sjR*0QGx(8*={2OEbg(e=Ln6pNqK&qY~HEysA;S0dAyMvjM=Ri>@J3nnTzA0e9N
zn4j+B4mXG1>7|T@9V&?pVs)c#`0~|AVzVS1gmRT73!U&CAvcNl6^
z9<1@@cL%2tK2!WfXTIx!L*At_o~|f!^w!v>n6=Nw;=L}%R*eWJnpobrifVv@p&7sO
zX^dh!cUT;u9(>M0{UIISj#9AIXQH>H)(lCx^EZ(#nBdxnWpO?Yr$+%ZeLwD@vo-280jv_OY4%UhH`~?qUz!s?v5`sDZt_X;^=E{T;@Ji%yC+vlgh(~t6@C~7ZM3CTU
z1{UwP{9q;Y2KZbh9}IXZ?6u8;A3nq?@~L1syf
zsL9*`iYC^^bTyglkI(F%fteu#Wv#1aHt!q8R`wSlFO%BOkPZ{So0LUeXQMmYiWfzF
zPA3u)3Tncm=l2ovh>Lx>ED=FNrEPyYy?s5yGXIOr+X#?Xm
z(8+=yWre}KTosh`qQy!p$}Zm3a}4AM2{jEC|A8cAi~d1&^a9K7e;t#ee%6cd>dXpY*k_fif
zFvbLcxHsk-4lmn_SQfb2pa)$v8PjeX59=UM@ayjslz?;i&b{|h?u(-gZOpmj;Wu#^1e;jC-ph
z>#595o$0~FYAq(i70}qsz-neZVp|#!E8xABd4P}xY1h=OGWt6<6yP!3sjm1StpiSec2${{hp@b|M;Nmj@>o#2RZjAk!4yj}HhYKkqXo)wY|#DHzx{Tu
z4Ztjk42qlbfkw91vbJQe1vk~3HsU@F3d9y*h5a(fb>`+uM%1MxaDu=e*5G|8hk+_W
zQbbFzb?iGlx>cARKImOwB6~+P9{+5{gAOT$IZy|sA@oc680bh~1)U`)=6YJNY^k&D
zR6I?Ti_XuFzewM+pP2PPRuKlt?wuV1h77;pfX*@2S7r^Te8yzORbXCz3jAvZQ3GYz0OebaWiU#{wbNw{sXwpnGI^S{ARQKS^gw-a>Hs+
z(3nX_(nQ?gGDylfu^poGX~TeKV*GTq;-er${KdUHo;_qLL#L!s<)i%MJ<{Z$@J3B=
zxvyWrFu$DMOKUus;pRd*Q{TfR(%1c_+0$4;r~9IwAPxWtCzD9^-d+$Zg0M<6P@}Q=
z&kjnLvOZ!ky-+1yM%!5~`QK3R=&Uf#Xg5LhDdkQ%KEg!aj_sL?ZsB{~(j)Y{8&%*caEj>LkLsY9|HmriDk30?ebsDs
zfQ&i8c!2qwjkgJ})Z@(-WUoIb!VWs`!%r?@NZ4mQgInltJ)gqP(E`EW<{92wo{m54
z=E2kPf<^o%pOpril?>43$!e4EG+3qWi$_lYoSl#O2fo!~U9^;$DiU)b-JBX2a|#Kr
z9TsM}zi4wyFKKBG?RRzRzvTQULlUdFM9l#V+Dp#B5S3qyLdL4m`M}`VP}NhbP{8(k
zfP$`Jcvn#&kKLHh(HPH29Ebs|K7rom(Wj?K-^HJn%j55E&8p~ziR
z2^lK<+M=%M0$x4qsRkKTWhaok!BS+~M0Ewnu(C&rJ8UZq|Bxy&QzC@rkYmylg8X?w
zQW|v#BN6xaF%@cjcPzDa4U22lujqxUl{mcXdYI175$yte)^MIok*ioU5mC*0XHH00
z9aZY&^>yf*CpVfbH~pDsRhI1k4t4I!C<*ICg%
zIjCmAR>6(Wdb8HIE>^sV3{k1(&_d$4O(QT77x}$U>fAkOzr8Y{S_y8ef$xo$52T2J
zE`kE%ckRkvtK^@*zUY8CWv^ty6V9K;y5tq(WG+M8GJM(>-?Aks;f%=XA*y6=7l;hx
zLt{3`4Ns)JxHlSTidP9`aGjq{)G=2+y|&bv)%6N_AxFSS=l8a&UihsleT$VK*u@KsQ9HaSqu5*R@GPTPPDm_Cz3Bq!MpP*j
zi<1?zRQ5c-p8xqG(9A&;rC4+&y^O~uf?{Kbo!bIa%u$yNLTx=hL;yT^2#3ND
z7nY?XUpF&&{M<T~)=aD4uu
z#LBtaca0GYR>s?W(4skoR_^$S+Jv~V4sz^o25wkFrrD>J+q^&ac-uZQ`5G)RpyYIe
zO;7u{{04V0k8}*>Sup0$0XbSg|v3g_m7G}%;Iod##I|5`m+Q1drfn(n9b&||CF;Rsb^q$k%?l(m(}G8?BUzz7^khZ8$LIAg55I-14RhT
z@!+gf<}7t+N5l7;S}IXA&=K?1u5o{A5oQi$&fp0`p&tRF^MzCQ+&rBJ5K;=floMfI
z0f1Y-TcdNBAES8#q=x=$@mW8qC5z$>@A|~yiK^mrx-uK26s}!A>uc>LR0~#K8tBA5
zn#a-J!pKlqV5q~Y9`J!maEnt7iJ9z)7}?s{H28l8G6*uPvgXUtW|{wF1y4SFjBR(Lutrdnu~(-lV;27Y8A{{
zsDp(g6H@0-et*ICo)vNN#QMy<@
zTle(!#X?py$smy3ZA)Y?)0})yn45eYdXXOOGjpn^u$#rqqA*@Shylz=&v9v&R(V!PNHQRfKB1*YmIFMs;Gi>bE?BQd~viA
z<@4$cf($sR!#GaX0r(X!H^Rz|q*aEvjkAN78WQk^wAJ-FDwl<2W8U*fjxsPZ?W6ch
z_*aRo%l>o|(Df>0>7T~T-~L{Tory??NxsE~5^YB76TlbT`wia5CGQ(r=14w?1bJ6k
zi$Hu=yi;X_c;zZ{@>0JgTJYQALMnx&Jwg@!)X`KMsQy2NJ1#lF6c0vmpkoSig)-ly
z)iE&+vDU3s;hAM!C|N&RQ8Za^t;AIi-y}|Ywdklk_CoU=;0y=ip{q|np(`FXYn=w=
z&lTZbvQWx3cPlS>6-=t>3<&;Kcb;OQ?kKoXM+FM8b+9qM=YtXiFt&F#*U%esj?pB>
zFth@+I>OX4JAxOhzv|?u;q(E0j@@1Sr4Q+(<|GC{XhPEAgt%G-5!#9@{!HY-7()Tv
z000V6K>`V)M#S%7a3L+~EALcDRNBjYve83mZ?TPURj4-=M>EZ>JJFD&$q5KI{R}V7
zjHHs@fG)~ATfI6-eLt%&s!d^T;1KwIW0pDBzOk)^T|?6G1*b)9y5=yv$(L%Xci#`R
zOI#Mm1!hz&LwIb&6IgxVL^&0V=m%Fk;on=ZgWh3&P>TA#3m2Omep!^5h}IW@-p7Qo
zu^O0L|AhT31`_n@^R(J}w;n4ebQ`u=&~OYqk7q=yoxzib7I@mMdf@rV>!8SA(R3>c
zGTL1J6)2K6TLnybN-R%x1cY6Tx`C@wpu)~#MBcJZfLoN2TGCz8s`9Tc@m3e@kti8m
z;8kAZ;jXLM$zn|_WyOL{F?}3nTmg3hY!jZ|dk71+eVL6}9et!#xi(%c@3dOn8>`mD
z_J!!dTS$I_)K%PnO_yjHRQ=DYMw7{BwuxG`6cYvFXk5O-K`a0^l^}hZ$5H`h{##sL
zsR?7=3$Wgjx2G*$w;V#oE`RaK6agU=gwBGR%SrAzpl>X8p@Qgna@i1s2OQg>QSfnzpc(M65==Y9+6lFLh!!z+?7XiG6I04jt!Ab*GpjhH
zhn}?RutbkojQvrFlLdfEfJkujTd`->rDV1SpA!%FmZW-H2Io7dtkFXB%jiBXNPEx1hP5*aFP|?|t%75TtIz!OvH%VvBNWoE+qKj>|@+mv36x!WrP8(vg@@s|}+-o9Bq;qr$8bO*7ghG_GG?vspxF
z?*9cInjtFMQr59d8DchxY|(dbh+1M#G29yYry`EBvaL@M{AvnG;Y9}!lGxD$M8|Y$Oy09z
zpeU`*Q-gRMZ*to^VbWbZ;Kq^^Fd@DSGTM^a_U7D1k&cG!x8c4o9RdO?iaQO-P3yEF
z^uY3u3&wK&{f0o@(QTZ_6I3>`ZX8Keg#eJ-aW8y#0{}VvM&67N}UBEbWu>2S8N7;iQ$mTKp6vQ8DICHvoIF-UZhIM
zrSaoJw3e;*Myl3IVPm!9y26va<$bYLRWkK*L~@+;zTT0wg{k)*mTgtuHtf6OgW75A
z5FTIDg(-x=-VHGfF|TL@8f5BnF}R_M$e@M5p6teYkfj|e3J@4&+GhgIH1fFdpD{Sq
zKk^-j(J@7;;Dhi35K29%w21efg@Z-GILU;>pNUm$u%ZOVfesVvQzuN|17kNr$nKiqoo7BoA+z++`V4j
zX9cBttu}zYSvs{b&_Ackn^dB9U9awx(?M`EEy+lF2p@JfgyW>uSh}t=WmS
zRXt9adE6HKjEZ>Fx6^TZenAL6DLmrG4Wx{v_a@PwO^AV>fB`5pn3@wQh0ur1btS#(Lr{fQOq;nZaZ-
zhtL$!hgfP;(0dOTRvkPx0NXs*ucZ&Nh0~z?*6-~Qh`ct2I)1)$idXGZ4YjMWZf6+h
z{<`hak;0h9L)C+DMJ@<3Ty`tEu}&-yIY<)8dWF(_VPOl$+?^Itjoz5>X3(s1H6^P#
zQa-wMk*>{ibwTx8@y?L0?$tT+3g>LKI&N>mp#oS>J>2*TNG4LgoyhU(F(qF&_eBQAbU-IRjZA?3{iA|!`rY->eksv!CHi>2e4BBge?-N
z;87zYI*!J;sU8U&tc5O*bfLMbk}NX9-dD&X?Y3eE>4^XT7Q=OYhB^1o^LAyNAiYx~
z{z#!BPo{d5$e|`yQUhpPMO87T9jZ@rE9YoQA*(LKcnlNOBZu+qkxl
zrtCpX=Vr#16I-c|dgvr&5=>-euwVDF9~7>Ow)D3uC>SMad2Vut^{xP(YwG
z+IY+m@?jG<99I1>p3i(T`OyNHHM4|kT4?kPG{CAZw;L1StWG6(i@>^UZ!SMGUy>jS
z;6j^LL&5$s-Hn2QxRwcLGb3B*>>^&lkz{Pp08L-FZfy(aaGH{;qMzwJa2k3vFZib`
zpy6s<^ifiQ(#d2|ZP*(7zYh9L0D}BuzxXJo^%(;6N$qWzRbeAVkX%vi9b5olA-%XH
z(F+E2(5$p5T4p`-Fhl7zOX|ZA$G?Y3le(%O5T{SERHU+aQqY{)mVx{`uQ_vlIvqRL
zeRHDPR+gK(y;5Knf0*jtq{%BX$R`h>`^x>)`1cqTBp&MK$xKMa6TVzWB!AcxiaHuY
z#Z-1rD&KwWS@~bph#0x4MMGKEGw?n{kEjoj-&W8wFLP?}SMNGgdiB`^&N0ymALgpx
zOCtimG}ZU=e(BYu!tS$D3rO_UK22juzn0y6gFx3*!}6tHcsRkza^Y2qrT_p3xIqB|
zXpyly|6u=K(av;BipeLZ#&G%AUHR1`yUg2fqyz=c-{de?cxZ{=1jGZ{$iSO
z%#9C_ukykw__~xW;ZhSbYK*^-X
zANFn~orjo?&QU7K8a0K@kVrcUc3PkHqAz`?JAtD<)e;tRp9|V3xiVF(Zq0nPT+39O
z0BV$#tyjG2tkzpXk3M{d&w(DxKE7~wH)nsP%jQz?N@Pd9r?w}9P={}v&M=d^%tA4k
zYP&Aj$^B7{jVwomJXz>y`G8jQo3)=>?xfZRW9&@CJ(#{kkDxg`H>)xoWlG050oqlj
zn0k0w)Kb3x)r3Mf^S^?H%?aTo0P>o14w%)6=5z$O7?@9W??&Clxu4iu&3
zgluQgAkM?E@V?B6ahMt`r>C2{2<2OA-sg$*-36;U553HYuoUD))z4eG7x`}#Bus)*&Sz>_
zT~t$&gC{=$W~An~T%@>K(7f(v^ZgoJx*p;;WJWR(VM
zbX&@D5X|tZTFJ=Fe^3^h0D<*qK#AI`3tH)o=dzhQx8`w6gL?UW)>~5}Hs5k_PqhOh
zkmc;c++Psv2`9s+`^Fxw3i9&kQfQ7Pb5E9`z+jBtw;lJe92*6)Syb3*5RnX9rM?fP
z^C{s2FXnByaYO3=>xIOpqAa~+1-hvx4b{N>uh5Q^0KjV(lfE9s*2dwZ?
z6c|psRUFyFMq^$&*FUS>X1GG+#xjgOE40P$9EkNG4NG^*LvDa%RwRZYa2@tV@EiVA
zK-$Bc5mI2hF3?h4;1woYuuu1^_r4{3P`LkeJ`3XA?nTgm>qqME^U|ETWH?z2`mdKi
zs431v%NJlIG;}Q5s^KN!`}%UOOZ26t&QC_jp}>?GfU6aeF6eE(R>*#<(TF~$iY^?d
zdLLV{oCLEZb1I}%MA}?J^BTC@voksF753X#wrWbOct}LOIl2luw)p!le|7JK95|fx
z%qJ<>DhRp8cJ+bi5h2`u>aw@|I&OS|{zambVOoqMpko1A%=Z2j2bV%$?m4YpW4p
zWLG|};vA8px~TQlA^F1<5~~kk0XA2EcSBWtBL{ctCrTw_cU;OOV$O;rkDhPx^WTDr
z1eE&?($uoBg+|ktL|;Wn1iSk`T+;b>!dyGsFmJfOU`NB`CCx{<=t
zFw6aV3D$_KLFql}aIo04pwaZ0RjLtWuu88=a%!Qi7z}ev6BWLY3RsQdmMC&Yu>Alo
z!uL@Kg+jscZ8sYZx79e{p4N3@J)^map73D-MVUtQEUjP@xJxMIMIhy0Z@{m)cFJNc
zpd+B_l4Yqu(T=ivs+wvqjJ!|{Rt8vr9TCZS_J_+>XICLcAJHcfyH`GgHZ?g4V~*nK
z$ERR6rD@W@0^C2UyM?gLMOeZikHq{x%BgY9FBQ(=scX}y0oou^?;zsYS!aI6FaQ7t
zcR>MyXpylN|408$Zatop+viUo(g4gFRR&uz)&!5U)%>}j!q{?k4p%dWW?Dr~6pSUg
z2a}JQzVb79@FwYnoeRF=fg%ZoPAtny{QQj|x{WEw5)4rr%rnu*;H%6h4H>ujD9HA`wU9s1Rl34>7KE
zeP}EILErTWUN{-6!^WKoZK_pjdnzuWdHWQPB>sZ@y2{ar!;)zXVJ(?(c6VnE7KIn;
z9Er7izX@U|O96bBJWJaaY5NmB>Uy5k3==8}_Rc3&(5272y@tOH5q+@g#)x0@v~a-y
zu~qUEt?fal@Ac}q=&@1=7dHui0i;!i;6GLd<##F4yIllX7&R1#`iG2KK)DY0Q;y$h
z8@Y8H?t(44GFMLm3XkqyG$&Lva^G<0ztZU|i6I?%V_7eZuH?QkvL*@Ylwu%xMG-F|0bxWbLTXU1g%uS4+bhrnFMu2bR92_y^
zrh|wIrGDt<*0nw+-)!4Lxnx%rj4<0=S-*3Q1tb{x_Pa_$3CtS=v1N;PPMm1cvA<+)
z8Jp(;f!5b=(?axXF3%MBY;iXgt|vs08EyVmXXaWX^up${bZEB?8*2HZ#gpV)Xh4~^
z^s0Pd-23(Axy$C<=md5HTBm-Yw=yTOOI(bx|CjOnbcxfTi?ScDeP1f)amw^|OlDi!
z(d5ja0;tS2o?(C9igP_}sJW8yvUPuy2=ms1|Jwa)
zN9RI4B7#%;AAM^W0N9ueV_FPQxgN+y8J2eqKTbim-XIBycC0QYs)e3RZOxRfF`@Uj
zbYF0bc3HpAsI?W-)&hMGgSFK0kRSs*_rkAXpyk)Uupi*l|
z#kaw~Tmt8B53`YT5TS#Sa{f8(dJoC;AJRsss>N_IVIQKD015^VVzc;BLViZ6aPAlc
z(#9*EXGZ4NzSgt+0Q1WhVamJmh{(l9Zzl1(n-sZIqmwK`@nK-jSx1+2uhBT~n
z_Mzc8YdPdPj|>>@nMZvJ_TmQhymF3B>PPGAPxdEivs!meFt4xThx+OZHG>Is+l8n&
zZ9uRL08n&sZLF$l@|O;i)O>qI5@xJe%$3_J{*aLDYDd=Y3X4j_%NU4AIADQf{%9G(
zG!rTXyHNKB5u$qDGHOFI5YRuV&@y3+Lf3aKRJ{9RoWy?*8%3kALN-|KS1;F>W1}!=
zR2ojB@HukY?39R6Ux7FjZM8G)3U{D#v)wA7KJBjj5gNWhX#Jj
zm{*RHIkbyDXV%E7o!)r02r3Vnray3?JZaoGz7QJUoz9r$m22D6yhU`0rV!Rl%Bi9n
z&VpgnaZvS5fxS{JqqQ4Y)G^^!lYZEXJ`*|A9kTC!qekaIaVDs*mdY4+5ysCN}qJan!;jmwhMrf~7(1W@XzY
zn86B$`5>V5LiVv#_gO6JA&P9@W!
zqWwfRVoNr!!iIw`Ur(IZeoYJ^m)SwPnFcGJKLkvnXblVieyhdU0009RL7qZ*mH+b>
znCc$I%#kf%9JBvkTE5??9zOU3=Lh9waa2g<@02OR`E(!mx>>~O#vjGOP
z$J=*Mj%$scHkCBy@wHcSL;NgT_YYNS%N&@*zMAb?CgIiG0@llnE
z4wQ8rVFPe~(MhtRr78CFU3xRC-I$j9uG8ynbNMhy-ZrDG2qDF<=tPgO>dc}bb3GYX
zl~Gum$U0gu@dU$kKTa7BCQH*-O#XsQ=zuPrv3un>A*GEcSvT=8$63;u*#+|@ySdck
z&iJoF0xFD6pToa&MRTUt(VU{&S(TbV`RMrIYj&{`JPHZETSHFQamO3vUvb=4_2NE}
zt-FBJub>x~Fu{448AIC<=FLU`jf!o)k=Vj;htO2{ndB;4_eVV)